Merge branch 'main' into users/joker-eph-python-bindings-maintainers

author: Mehdi Amini <joker.eph@gmail.com> 2025-08-14 15:36:46 +0200
committer: GitHub <noreply@github.com> 2025-08-14 15:36:46 +0200
commit: df57d6a01e85ca78da2febab21b268d9fd6955a0 (patch)
tree: 19b0aab453e6bc7e2b15d3220024dfdacd4fa57e
parent: df86ea61b7ed484ca797f96d7ad40fd9ada7ba30 (diff)
parent: 7bda76367f19cfc19086f68d9dd5ac019a9ceccd (diff)
download: llvm-users/joker-eph-python-bindings-maintainers.zip
llvm-users/joker-eph-python-bindings-maintainers.tar.gz
llvm-users/joker-eph-python-bindings-maintainers.tar.bz2
653 files changed, 22168 insertions, 12478 deletions
diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml
index f76c69f..55a269c 100644
--- a/.github/workflows/build-ci-container-windows.yml
+++ b/.github/workflows/build-ci-container-windows.yml
@@ -25,7 +25,7 @@ jobs:
       container-filename: ${{ steps.vars.outputs.container-filename }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: .github/workflows/containers/github-action-ci-windows
       - name: Write Variables
diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
index 7f01264..3e91c49 100644
--- a/.github/workflows/build-ci-container.yml
+++ b/.github/workflows/build-ci-container.yml
@@ -30,7 +30,7 @@ jobs:
             runs-on: depot-ubuntu-24.04-arm-16
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: .github/workflows/containers/github-action-ci/
       # podman is not installed by default on the ARM64 images.
diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml
index af4d599..265fd73 100644
--- a/.github/workflows/build-metrics-container.yml
+++ b/.github/workflows/build-metrics-container.yml
@@ -27,7 +27,7 @@ jobs:
       container-filename: ${{ steps.vars.outputs.container-filename }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: .ci/metrics/
       - name: Write Variables
diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml
index ec2615d..7e8c156 100644
--- a/.github/workflows/check-ci.yml
+++ b/.github/workflows/check-ci.yml
@@ -22,7 +22,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: .ci
       - name: Setup Python
diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml
index b807485..7d37b90 100644
--- a/.github/workflows/ci-post-commit-analyzer.yml
+++ b/.github/workflows/ci-post-commit-analyzer.yml
@@ -41,7 +41,7 @@ jobs:
       LLVM_VERSION: 18
     steps:
       - name: Checkout Source
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
diff --git a/.github/workflows/commit-access-greeter.yml b/.github/workflows/commit-access-greeter.yml
index a5fbbbb..f31cd01 100644
--- a/.github/workflows/commit-access-greeter.yml
+++ b/.github/workflows/commit-access-greeter.yml
@@ -18,7 +18,7 @@ jobs:
       github.event.label.name == 'infra:commit-access-request'
     runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: llvm/utils/git/
 
diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml
index d401a13..a7be81b 100644
--- a/.github/workflows/commit-access-review.yml
+++ b/.github/workflows/commit-access-review.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       
       - name: Install dependencies
         run: |
diff --git a/.github/workflows/containers/github-action-ci-windows/Dockerfile b/.github/workflows/containers/github-action-ci-windows/Dockerfile
index c06fcc0..640d34d 100644
--- a/.github/workflows/containers/github-action-ci-windows/Dockerfile
+++ b/.github/workflows/containers/github-action-ci-windows/Dockerfile
@@ -90,7 +90,7 @@ RUN powershell -Command \
 RUN git config --system core.longpaths true & \
     git config --global core.autocrlf false
 
-ARG RUNNER_VERSION=2.327.1
+ARG RUNNER_VERSION=2.328.0
 ENV RUNNER_VERSION=$RUNNER_VERSION
 
 RUN powershell -Command \
diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile
index 4b0d5e2..2274960 100644
--- a/.github/workflows/containers/github-action-ci/Dockerfile
+++ b/.github/workflows/containers/github-action-ci/Dockerfile
@@ -97,7 +97,7 @@ WORKDIR /home/gha
 
 FROM ci-container as ci-container-agent
 
-ENV GITHUB_RUNNER_VERSION=2.327.1
+ENV GITHUB_RUNNER_VERSION=2.328.0
 
 RUN mkdir actions-runner && \
     cd actions-runner && \
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 3970271..b627803 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -55,7 +55,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 2
       - name: Get subprojects that have doc changes
diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
index 904ad71..9390fba 100644
--- a/.github/workflows/email-check.yaml
+++ b/.github/workflows/email-check.yaml
@@ -14,7 +14,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           ref: ${{ github.event.pull_request.head.sha }}
 
diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml
index b6530fe..72cbbe2 100644
--- a/.github/workflows/hlsl-test-all.yaml
+++ b/.github/workflows/hlsl-test-all.yaml
@@ -29,25 +29,25 @@ jobs:
     runs-on: ${{ inputs.SKU }}
     steps:
       - name: Checkout DXC
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           repository: Microsoft/DirectXShaderCompiler
           ref: main
           path: DXC
           submodules: true
       - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           ref: ${{ inputs.LLVM-branch }}
           path: llvm-project
       - name: Checkout OffloadTest
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           repository: llvm/offload-test-suite
           ref: main
           path: OffloadTest
       - name: Checkout Golden Images
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           repository: llvm/offload-golden-images
           ref: main
diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml
index efd0459..7fd0280 100644
--- a/.github/workflows/issue-release-workflow.yml
+++ b/.github/workflows/issue-release-workflow.yml
@@ -42,7 +42,7 @@ jobs:
       contains(github.event.action == 'opened' && github.event.issue.body || github.event.comment.body, '/cherry-pick')
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           repository: llvm/llvm-project
           # GitHub stores the token used for checkout and uses it for pushes
diff --git a/.github/workflows/issue-subscriber.yml b/.github/workflows/issue-subscriber.yml
index de1c45c..afcd17c 100644
--- a/.github/workflows/issue-subscriber.yml
+++ b/.github/workflows/issue-subscriber.yml
@@ -14,7 +14,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Checkout Automation Script
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: llvm/utils/git/
           ref: main
diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml
index a2c4f58..3036582 100644
--- a/.github/workflows/issue-write.yml
+++ b/.github/workflows/issue-write.yml
@@ -25,7 +25,7 @@ jobs:
       )
     steps:
       - name: Fetch Sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: |
             .github/workflows/unprivileged-download-artifact/action.yml
diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml
index 9ba77ff..8967cd0 100644
--- a/.github/workflows/libc-fullbuild-tests.yml
+++ b/.github/workflows/libc-fullbuild-tests.yml
@@ -52,7 +52,7 @@ jobs:
           # - c_compiler: gcc
           #   cpp_compiler: g++
     steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
     
     # Libc's build is relatively small comparing with other components of LLVM.
     # A fresh fullbuild takes about 190MiB of uncompressed disk space, which can
diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml
index e3dc416..7154946 100644
--- a/.github/workflows/libc-overlay-tests.yml
+++ b/.github/workflows/libc-overlay-tests.yml
@@ -41,7 +41,7 @@ jobs:
               cpp_compiler: clang++
     
     steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
     
     # Libc's build is relatively small comparing with other components of LLVM.
     # A fresh linux overlay takes about 180MiB of uncompressed disk space, which can
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 4d47c07..3836cc5 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -38,7 +38,7 @@ jobs:
       LLVM_VERSION_PATCH: ${{ steps.version.outputs.patch }}
     steps:
       - name: Checkout source
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 250
 
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 280f7ca..2e6ff7f 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -54,7 +54,7 @@ jobs:
             cc: 'gcc-15'
             cxx: 'g++-15'
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       - name: ${{ matrix.config }}.${{ matrix.cxx }}
         run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
         env:
@@ -99,7 +99,7 @@ jobs:
             cc: 'clang-20'
             cxx: 'clang++-20'
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       - name: ${{ matrix.config }}
         run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
         env:
@@ -163,7 +163,7 @@ jobs:
           machine: llvm-premerge-libcxx-next-runners
     runs-on: ${{ matrix.machine }}
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       - name: ${{ matrix.config }}
         run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
         env:
@@ -211,7 +211,7 @@ jobs:
           os: macos-15
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       - uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0
         with:
           # https://github.com/actions/runner-images/blob/main/images/macos/macos-15-Readme.md
@@ -252,7 +252,7 @@ jobs:
         - { config: mingw-dll-i686, mingw: true }
         - { config: mingw-incomplete-sysroot, mingw: true }
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       - name: Install dependencies
         run: |
           choco install -y ninja
diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml
index 43c446a..c87ee8e 100644
--- a/.github/workflows/libcxx-build-containers.yml
+++ b/.github/workflows/libcxx-build-containers.yml
@@ -30,7 +30,7 @@ jobs:
       packages: write
 
     steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
     - name: Build the Linux builder image
       working-directory: libcxx/utils/ci
diff --git a/.github/workflows/libcxx-check-generated-files.yml b/.github/workflows/libcxx-check-generated-files.yml
index 0226edd..f338bd6 100644
--- a/.github/workflows/libcxx-check-generated-files.yml
+++ b/.github/workflows/libcxx-check-generated-files.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
       - name: Install dependencies
         uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1
diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml
index d40ed5b..8621a3b 100644
--- a/.github/workflows/llvm-project-tests.yml
+++ b/.github/workflows/llvm-project-tests.yml
@@ -86,7 +86,7 @@ jobs:
       # actions/checkout deletes any existing files in the new git directory,
       # so this needs to either run before ccache-action or it has to use
       # clean: false.
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 250
       - name: Setup ccache
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index a9bd8db..52b486e 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -38,7 +38,7 @@ jobs:
       LLVM_VERSION_PATCH: ${{ steps.version.outputs.patch }}
     steps:
       - name: Checkout source
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 250
 
diff --git a/.github/workflows/merged-prs.yml b/.github/workflows/merged-prs.yml
index c771736..107bbc5 100644
--- a/.github/workflows/merged-prs.yml
+++ b/.github/workflows/merged-prs.yml
@@ -21,7 +21,7 @@ jobs:
       (github.event.pull_request.merged == true)
     steps:
       - name: Checkout Automation Script
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: llvm/utils/git/
           ref: main
diff --git a/.github/workflows/new-prs.yml b/.github/workflows/new-prs.yml
index 935598e..e1f2e75 100644
--- a/.github/workflows/new-prs.yml
+++ b/.github/workflows/new-prs.yml
@@ -35,7 +35,7 @@ jobs:
       (github.event.pull_request.author_association != 'OWNER')
     steps:
       - name: Checkout Automation Script
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: llvm/utils/git/
           ref: main
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index 70bcaaf..dc55b7e 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -19,7 +19,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 2
 
@@ -35,7 +35,7 @@ jobs:
       # We need to pull the script from the main branch, so that we ensure
       # we get the latest version of this script.
       - name: Fetch code formatting utils
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           repository: ${{ github.repository }}
           ref: ${{ github.base_ref }}
diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml
index 57425e0..f0197d7 100644
--- a/.github/workflows/pr-request-release-note.yml
+++ b/.github/workflows/pr-request-release-note.yml
@@ -19,7 +19,7 @@ jobs:
       # We need to pull the script from the main branch, so that we ensure
       # we get the latest version of this script.
       - name: Checkout Scripts
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: |
             llvm/utils/git/requirements.txt
diff --git a/.github/workflows/pr-subscriber.yml b/.github/workflows/pr-subscriber.yml
index f558da8..23c7a67 100644
--- a/.github/workflows/pr-subscriber.yml
+++ b/.github/workflows/pr-subscriber.yml
@@ -14,7 +14,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Checkout Automation Script
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: llvm/utils/git/
           ref: main
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 6e59841..8ac57ec 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -31,7 +31,7 @@ jobs:
     runs-on: llvm-premerge-linux-runners
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 2
       - name: Build and Test
@@ -88,7 +88,7 @@ jobs:
         shell: bash
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 2
       - name: Compute Projects
@@ -132,7 +132,7 @@ jobs:
       (github.event_name != 'pull_request' || github.event.action != 'closed')
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 2
       - name: Setup ccache
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
index 7a1f232..6546540 100644
--- a/.github/workflows/release-asset-audit.yml
+++ b/.github/workflows/release-asset-audit.yml
@@ -23,7 +23,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: |
             .github/workflows/release-asset-audit.py
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index c113b42d..116bdfb3 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -73,7 +73,7 @@ jobs:
         python-version: '3.12'
 
     - name: Checkout LLVM
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
     - name: Install Dependencies
       shell: bash
@@ -195,7 +195,7 @@ jobs:
     steps:
 
     - name: Checkout Actions
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       with:
         ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
         sparse-checkout: |
@@ -216,7 +216,7 @@ jobs:
       run: mv workflows  ../workflows-main
 
     - name: Checkout LLVM
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       with:
         ref: ${{ needs.prepare.outputs.ref }}
 
@@ -286,7 +286,7 @@ jobs:
 
     steps:
     - name: Checkout Release Scripts
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       with:
         sparse-checkout: |
           llvm/utils/release/github-upload-release.py
@@ -338,7 +338,7 @@ jobs:
     runs-on: ${{ needs.prepare.outputs.test-runs-on }}
     steps:
     - name: Checkout Actions
-      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
       with:
         ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
         sparse-checkout: |
diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml
index 5a0aa06..712ff18 100644
--- a/.github/workflows/release-documentation.yml
+++ b/.github/workflows/release-documentation.yml
@@ -34,7 +34,7 @@ jobs:
       upload: ${{ inputs.upload && !contains(inputs.release-version, 'rc') }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
       - name: Setup Python env
         uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
@@ -66,7 +66,7 @@ jobs:
 
       - name: Clone www-releases
         if: env.upload
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           repository: ${{ github.repository_owner }}/www-releases
           ref: main
diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml
index d47c433..17c6774 100644
--- a/.github/workflows/release-doxygen.yml
+++ b/.github/workflows/release-doxygen.yml
@@ -40,7 +40,7 @@ jobs:
       upload: ${{ inputs.upload && !contains(inputs.release-version, 'rc') }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
       - name: Setup Python env
         uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
diff --git a/.github/workflows/release-lit.yml b/.github/workflows/release-lit.yml
index 9adeffb..60ec644 100644
--- a/.github/workflows/release-lit.yml
+++ b/.github/workflows/release-lit.yml
@@ -28,7 +28,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           ref: "llvmorg-${{ inputs.release-version }}"
 
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 9943891..14cc4c4 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -71,7 +71,7 @@ jobs:
       attestations: write
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           ref: ${{ needs.inputs.outputs.ref }}
           fetch-tags: true
diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
index c9ae7e1..a184996 100644
--- a/.github/workflows/release-tasks.yml
+++ b/.github/workflows/release-tasks.yml
@@ -38,7 +38,7 @@ jobs:
           sudo apt-get install python3-github
 
       - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 
       - name: Create Release
         env:
@@ -129,7 +129,7 @@ jobs:
           sudo apt-get install python3-github
 
       - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           sparse-checkout: llvm/utils/release/github-upload-release.py
           sparse-checkout-cone-mode: false
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index 6cc80fb..40db550 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -31,7 +31,7 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           persist-credentials: false
 
diff --git a/.github/workflows/version-check.yml b/.github/workflows/version-check.yml
index a0a5980..7e45188 100644
--- a/.github/workflows/version-check.yml
+++ b/.github/workflows/version-check.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 0
 
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
index 97dfd0f..df3a8b2 100644
--- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
+++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
@@ -162,14 +162,19 @@ in .clang-tidy file, if any.
                                    cl::init(false), cl::cat(ClangTidyCategory));
 
 static cl::opt<std::string> LineFilter("line-filter", desc(R"(
-List of files with line ranges to filter the
-warnings. Can be used together with
--header-filter. The format of the list is a
-JSON array of objects:
+List of files and line ranges to output diagnostics from.
+The range is inclusive on both ends. Can be used together
+with -header-filter. The format of the list is a JSON
+array of objects. For example:
+
   [
     {"name":"file1.cpp","lines":[[1,3],[5,7]]},
     {"name":"file2.h"}
   ]
+
+This will output diagnostics from 'file1.cpp' only for
+the line ranges [1,3] and [5,7], as well as all from the
+entire 'file2.h'.
 )"),
                                        cl::init(""),
                                        cl::cat(ClangTidyCategory));
diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
index 80f1766..670e0a2 100755
--- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
+++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py
@@ -483,7 +483,7 @@ async def main() -> None:
     parser.add_argument(
         "-line-filter",
         default=None,
-        help="List of files with line ranges to filter the warnings.",
+        help="List of files and line ranges to output diagnostics from.",
     )
     if yaml:
         parser.add_argument(
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 187aae2..b481c56 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -109,6 +109,9 @@ Improvements to clang-tidy
   `enable-check-profile` to enable per-check timing profiles and print a
   report based on all analyzed files.
 
+- Improved documentation of the `-line-filter` command-line flag of
+  :program:`clang-tidy` and :program:`run-clang-tidy.py`.
+
 New checks
 ^^^^^^^^^^
 
diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst
index b7a366e..e8ce903 100644
--- a/clang-tools-extra/docs/clang-tidy/index.rst
+++ b/clang-tools-extra/docs/clang-tidy/index.rst
@@ -213,14 +213,19 @@ An overview of all the command-line options:
                                        Can be used together with -line-filter.
                                        This option overrides the 'HeaderFilterRegex'
                                        option in .clang-tidy file, if any.
-    --line-filter=<string>           - List of files with line ranges to filter the
-                                       warnings. Can be used together with
-                                       -header-filter. The format of the list is a
-                                       JSON array of objects:
+    --line-filter=<string>           - List of files and line ranges to output diagnostics from.
+                                       The range is inclusive on both ends. Can be used together
+                                       with -header-filter. The format of the list is a JSON
+                                       array of objects. For example:
+
                                          [
                                            {"name":"file1.cpp","lines":[[1,3],[5,7]]},
                                            {"name":"file2.h"}
                                          ]
+
+                                       This will output diagnostics from 'file1.cpp' only for
+                                       the line ranges [1,3] and [5,7], as well as all from the
+                                       entire 'file2.h'.
     --list-checks                    - List all enabled checks and exit. Use with
                                        -checks=* to list all available checks.
     --load=<pluginfilename>          - Load the specified plugin
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index eef3d0c..2db1bae 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -760,7 +760,8 @@ Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±in
 The integer elementwise intrinsics, including ``__builtin_elementwise_popcount``,
 ``__builtin_elementwise_bitreverse``, ``__builtin_elementwise_add_sat``,
 ``__builtin_elementwise_sub_sat``, ``__builtin_elementwise_max``,
-``__builtin_elementwise_min`` can be called in a ``constexpr`` context.
+``__builtin_elementwise_min``, and ``__builtin_elementwise_abs`` 
+can be called in a ``constexpr`` context.
 
 No implicit promotion of integer types takes place. The mixing of integer types
 of different sizes and signs is forbidden in binary and ternary builtins.
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index 1f502db..6f86e0a 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -191,7 +191,7 @@ implementation.
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | device                       | teams construct on the host device                           | :good:`done`             | r371553                                                               |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| device                       | support non-contiguous array sections for target update      | :good:`done`             | https://github.com/llvm/llvm-project/pull/144635                                                                      |
+| device                       | support non-contiguous array sections for target update      | :good:`done`             | https://github.com/llvm/llvm-project/pull/144635                      |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | device                       | pointer attachment                                           | :good:`done`             |                                                                       |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
@@ -338,7 +338,7 @@ implementation.
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | OMPT                         | new 'emi' callbacks for external monitoring interfaces       | :good:`done`             |                                                                       |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| OMPT                         | device tracing interface                                     | :none:`unclaimed`        |                                                                       |
+| OMPT                         | device tracing interface                                     | :none:`in progress`      | jplehr                                                                |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | task                         | 'strict' modifier for taskloop construct                     | :none:`unclaimed`        |                                                                       |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
@@ -376,7 +376,7 @@ implementation.
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 | loop stripe transformation                                  | :good:`done`              | https://github.com/llvm/llvm-project/pull/119891                                                     |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| work distribute construct                                   | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
+| workdistribute construct                                    |                           | :none:`in progress`       | @skc7, @mjklemm                                                          |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 | task_iteration                                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
@@ -470,12 +470,16 @@ implementation.
 | need_device_addr modifier for adjust_args clause            | :part:`partial`           | :none:`unclaimed`         | Parsing/Sema: https://github.com/llvm/llvm-project/pull/143442           |
 |                                                             |                           |                           |               https://github.com/llvm/llvm-project/pull/149586           |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Prescriptive num_threads                                    | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
+| Prescriptive num_threads                                    | :part:`In Progress`       | :none:`unclaimed`         | ro-i                                                                     |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Message and severity clauses                                | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
+| Message and severity clauses                                | :part:`In Progress`       | :none:`unclaimed`         | ro-i                                                                     |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 | Local clause on declare target                              | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| groupprivate directive                                      | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| variable-category on default clause                         | :part:`In Progress`       | :none:`unclaimed`         |                                                                          |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 | Changes to omp_target_is_accessible                         | :part:`In Progress`       | :part:`In Progress`       |                                                                          |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index af576f8..e23b7a12 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -115,6 +115,8 @@ Non-comprehensive list of changes in this release
 -------------------------------------------------
 - Added ``__builtin_elementwise_fshl`` and ``__builtin_elementwise_fshr``.
 
+- ``__builtin_elementwise_abs`` can now be used in constant expressions.
+
 - Added ``__builtin_elementwise_minnumnum`` and ``__builtin_elementwise_maxnumnum``.
 
 - Trapping UBSan (e.g. ``-fsanitize-trap=undefined``) now emits a string describing the reason for
@@ -155,6 +157,21 @@ Improvements to Clang's diagnostics
 - Fixed fix-it hint for fold expressions. Clang now correctly places the suggested right
   parenthesis when diagnosing malformed fold expressions. (#GH151787)
 
+- Fixed an issue where emitted format-signedness diagnostics were not associated with an appropriate
+  diagnostic id. Besides being incorrect from an API standpoint, this was user visible, e.g.:
+  "format specifies type 'unsigned int' but the argument has type 'int' [-Wformat]"
+  "signedness of format specifier 'u' is incompatible with 'c' [-Wformat]"
+  This was misleading, because even though -Wformat is required in order to emit the diagnostics,
+  the warning flag the user needs to be concerned with here is -Wformat-signedness, which is also
+  required and is not enabled by default. With the change you'll now see:
+  "format specifies type 'unsigned int' but the argument has type 'int', which differs in signedness [-Wformat-signedness]"
+  "signedness of format specifier 'u' is incompatible with 'c' [-Wformat-signedness]"
+  and the API-visible diagnostic id will be appropriate.
+  
+- Fixed false positives in ``-Waddress-of-packed-member`` diagnostics when
+  potential misaligned members get processed before they can get discarded.
+  (#GH144729)
+
 Improvements to Clang's time-trace
 ----------------------------------
 
@@ -181,6 +198,9 @@ Bug Fixes to Attribute Support
 
 - ``[[nodiscard]]`` is now respected on Objective-C and Objective-C++ methods.
   (#GH141504)
+- Fixes some late parsed attributes, when applied to function definitions, not being parsed
+  in function try blocks, and some situations where parsing of the function body
+  is skipped, such as error recovery and code completion. (#GH153551)
 - Using ``[[gnu::cleanup(some_func)]]`` where some_func is annotated with
   ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520)
 
@@ -194,12 +214,15 @@ Bug Fixes to C++ Support
 - Fix the dynamic_cast to final class optimization to correctly handle
   casts that are guaranteed to fail (#GH137518).
 - Fix bug rejecting partial specialization of variable templates with auto NTTPs (#GH118190).
+- Fix a crash when using ``explicit(bool)`` in pre-C++11 language modes. (#GH152729)
+- Fix the parsing of variadic member functions when the ellipsis immediately follows a default argument. (#GH153445)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 - Fix incorrect name qualifiers applied to alias CTAD. (#GH136624)
 - Fixed ElaboratedTypes appearing within NestedNameSpecifier, which was not a
   legal representation. This is fixed because ElaboratedTypes don't exist anymore. (#GH43179) (#GH68670) (#GH92757)
+- Fix comment lexing of special command names (#GH152943)
 
 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/APValue.h b/clang/include/clang/AST/APValue.h
index 9999a30..cb942ea 100644
--- a/clang/include/clang/AST/APValue.h
+++ b/clang/include/clang/AST/APValue.h
@@ -143,7 +143,7 @@ public:
     AddrLabelDiff
   };
 
-  class LValueBase {
+  class alignas(uint64_t) LValueBase {
     typedef llvm::PointerUnion<const ValueDecl *, const Expr *, TypeInfoLValue,
                                DynamicAllocLValue>
         PtrTy;
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 84206cf..604c9cd 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1264,7 +1264,7 @@ def NondetermenisticValue : Builtin {
 
 def ElementwiseAbs : Builtin {
   let Spellings = ["__builtin_elementwise_abs"];
-  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
   let Prototype = "void(...)";
 }
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 116341f..a7f3d37 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10423,9 +10423,10 @@ def warn_format_conversion_argument_type_mismatch : Warning<
 def warn_format_conversion_argument_type_mismatch_pedantic : Extension<
   warn_format_conversion_argument_type_mismatch.Summary>,
   InGroup<FormatPedantic>;
-def warn_format_conversion_argument_type_mismatch_signedness : Warning<
-  warn_format_conversion_argument_type_mismatch.Summary>,
-  InGroup<FormatSignedness>, DefaultIgnore;
+def warn_format_conversion_argument_type_mismatch_signedness: Warning<
+      "format specifies type %0 but the argument has %select{type|underlying "
+      "type}2 %1, which differs in signedness" >
+    , InGroup<FormatSignedness>, DefaultIgnore;
 def warn_format_conversion_argument_type_mismatch_confusion : Warning<
   warn_format_conversion_argument_type_mismatch.Summary>,
   InGroup<FormatTypeConfusion>, DefaultIgnore;
@@ -10537,8 +10538,10 @@ def warn_format_cmp_sensitivity_mismatch : Warning<
   "it should be %select{unspecified|private|public|sensitive}1">, InGroup<Format>;
 def warn_format_cmp_specifier_mismatch : Warning<
   "format specifier '%0' is incompatible with '%1'">, InGroup<Format>;
-def warn_format_cmp_specifier_sign_mismatch : Warning<
-  "signedness of format specifier '%0' is incompatible with '%1'">, InGroup<Format>;
+def warn_format_cmp_specifier_sign_mismatch
+    : Warning<"signedness of format specifier '%0' is incompatible with '%1'">,
+      InGroup<FormatSignedness>,
+      DefaultIgnore;
 def warn_format_cmp_specifier_mismatch_pedantic : Extension<
   warn_format_cmp_specifier_sign_mismatch.Summary>, InGroup<FormatPedantic>;
 def note_format_cmp_with : Note<
@@ -13529,7 +13532,7 @@ def err_acc_invalid_modifier
 def err_acc_invalid_default_type
     : Error<"invalid value %0 in '%1' clause; valid values are %2">;
 def err_acc_device_type_multiple_archs
-    : Error<"OpenACC 'device_type' clause on a 'set' construct only permits "
+    : Error<"OpenACC 'device_type' clause on a '%0' construct only permits "
             "one architecture">;
 def warn_acc_var_referenced_non_const_array
     : Warning<"variable of array type %0 referenced in OpenACC '%1' clause "
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index c58e3f2..0f6cd00 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -155,6 +155,7 @@ FEATURE(ptrauth_vtable_pointer_address_discrimination, LangOpts.PointerAuthVTPtr
 FEATURE(ptrauth_vtable_pointer_type_discrimination, LangOpts.PointerAuthVTPtrTypeDiscrimination)
 FEATURE(ptrauth_type_info_vtable_pointer_discrimination, LangOpts.PointerAuthTypeInfoVTPtrDiscrimination)
 FEATURE(ptrauth_member_function_pointer_type_discrimination, LangOpts.PointerAuthCalls)
+FEATURE(ptrauth_signed_block_descriptors, LangOpts.PointerAuthCalls)
 FEATURE(ptrauth_function_pointer_type_discrimination, LangOpts.PointerAuthFunctionTypeDiscrimination)
 FEATURE(ptrauth_indirect_gotos, LangOpts.PointerAuthIndirectGotos)
 FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini)
diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h
index fb6dddf..2b92025 100644
--- a/clang/include/clang/Basic/PointerAuthOptions.h
+++ b/clang/include/clang/Basic/PointerAuthOptions.h
@@ -23,6 +23,10 @@
 
 namespace clang {
 
+/// Constant discriminator to be used with block descriptor pointers. The value
+/// is ptrauth_string_discriminator("block_descriptor")
+constexpr uint16_t BlockDescriptorConstantDiscriminator = 0xC0BB;
+
 /// Constant discriminator to be used with function pointers in .init_array and
 /// .fini_array. The value is ptrauth_string_discriminator("init_fini")
 constexpr uint16_t InitFiniPointerConstantDiscriminator = 0xD9D4;
@@ -223,6 +227,18 @@ struct PointerAuthOptions {
   /// The ABI for function addresses in .init_array and .fini_array
   PointerAuthSchema InitFiniPointers;
 
+  /// The ABI for block invocation function pointers.
+  PointerAuthSchema BlockInvocationFunctionPointers;
+
+  /// The ABI for block object copy/destroy function pointers.
+  PointerAuthSchema BlockHelperFunctionPointers;
+
+  /// The ABI for __block variable copy/destroy function pointers.
+  PointerAuthSchema BlockByrefHelperFunctionPointers;
+
+  /// The ABI for pointers to block descriptors.
+  PointerAuthSchema BlockDescriptorPointers;
+
   /// The ABI for Objective-C method lists.
   PointerAuthSchema ObjCMethodListFunctionPointers;
 
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index 1b133dd..d84f359 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -95,10 +95,20 @@ inline bool isStringLiteral(TokenKind K) {
 /// Return true if this is a "literal" kind, like a numeric
 /// constant, string, etc.
 inline bool isLiteral(TokenKind K) {
-  return K == tok::numeric_constant || K == tok::char_constant ||
-         K == tok::wide_char_constant || K == tok::utf8_char_constant ||
-         K == tok::utf16_char_constant || K == tok::utf32_char_constant ||
-         isStringLiteral(K) || K == tok::header_name || K == tok::binary_data;
+  const bool isInLiteralRange =
+      K >= tok::numeric_constant && K <= tok::utf32_string_literal;
+
+#ifndef NDEBUG // `#if !NDEBUG` is ill-formed when NDEBUG is defined empty.
+  const bool isLiteralExplicit =
+      K == tok::numeric_constant || K == tok::char_constant ||
+      K == tok::wide_char_constant || K == tok::utf8_char_constant ||
+      K == tok::utf16_char_constant || K == tok::utf32_char_constant ||
+      isStringLiteral(K) || K == tok::header_name || K == tok::binary_data;
+  assert(isInLiteralRange == isLiteralExplicit &&
+         "TokenKind literals should be contiguous");
+#endif // !defined(NDEBUG)
+
+  return isInLiteralRange;
 }
 
 /// Return true if this is any of tok::annot_* kinds.
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index c491eb0..a4eb92e 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -21,23 +21,21 @@ let SVETargetGuard = InvalidMode in {
 // Loads
 
 multiclass ZALoad<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
-  let SMETargetGuard = "sme" in {
-    def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimPQ", t,
-                          [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA],
-                          MemEltTyDefault, i_prefix # "_horiz", ch>;
-
-    def NAME # _H_VNUM : MInst<"svld1_hor_vnum_" # n_suffix, "vimPQl", t,
-                               [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA],
-                               MemEltTyDefault, i_prefix # "_horiz", ch>;
-
-    def NAME # _V : MInst<"svld1_ver_" # n_suffix, "vimPQ", t,
-                          [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA],
-                          MemEltTyDefault, i_prefix # "_vert", ch>;
-
-    def NAME # _V_VNUM : MInst<"svld1_ver_vnum_" # n_suffix, "vimPQl", t,
-                               [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA],
-                               MemEltTyDefault, i_prefix # "_vert", ch>;
-  }
+  def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimPQ", t,
+                        [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA],
+                        MemEltTyDefault, i_prefix # "_horiz", ch>;
+
+  def NAME # _H_VNUM : MInst<"svld1_hor_vnum_" # n_suffix, "vimPQl", t,
+                             [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA],
+                             MemEltTyDefault, i_prefix # "_horiz", ch>;
+
+  def NAME # _V : MInst<"svld1_ver_" # n_suffix, "vimPQ", t,
+                        [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA],
+                        MemEltTyDefault, i_prefix # "_vert", ch>;
+
+  def NAME # _V_VNUM : MInst<"svld1_ver_vnum_" # n_suffix, "vimPQl", t,
+                             [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA],
+                             MemEltTyDefault, i_prefix # "_vert", ch>;
 }
 
 defm SVLD1_ZA8 : ZALoad<"za8", "c", "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0_0>]>;
@@ -46,7 +44,6 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0
 defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>;
 defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>;
 
-let SMETargetGuard = "sme" in {
 def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "",
                           [IsOverloadNone, IsStreamingCompatible, IsInOutZA],
                           MemEltTyDefault, "aarch64_sme_ldr">;
@@ -54,29 +51,26 @@ def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "",
 def SVLDR_ZA : MInst<"svldr_za", "vmQ", "",
                           [IsOverloadNone, IsStreamingCompatible, IsInOutZA],
                           MemEltTyDefault, "aarch64_sme_ldr", []>;
-}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Stores
 
 multiclass ZAStore<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
-  let SMETargetGuard = "sme" in {
-    def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimP%", t,
-                          [IsStore, IsOverloadNone, IsStreaming, IsInZA],
-                          MemEltTyDefault, i_prefix # "_horiz", ch>;
-
-    def NAME # _H_VNUM : MInst<"svst1_hor_vnum_" # n_suffix, "vimP%l", t,
-                               [IsStore, IsOverloadNone, IsStreaming, IsInZA],
-                               MemEltTyDefault, i_prefix # "_horiz", ch>;
-
-    def NAME # _V : MInst<"svst1_ver_" # n_suffix, "vimP%", t,
-                          [IsStore, IsOverloadNone, IsStreaming, IsInZA],
-                          MemEltTyDefault, i_prefix # "_vert", ch>;
-
-    def NAME # _V_VNUM : MInst<"svst1_ver_vnum_" # n_suffix, "vimP%l", t,
-                               [IsStore, IsOverloadNone, IsStreaming, IsInZA],
-                               MemEltTyDefault, i_prefix # "_vert", ch>;
-  }
+  def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimP%", t,
+                        [IsStore, IsOverloadNone, IsStreaming, IsInZA],
+                        MemEltTyDefault, i_prefix # "_horiz", ch>;
+
+  def NAME # _H_VNUM : MInst<"svst1_hor_vnum_" # n_suffix, "vimP%l", t,
+                             [IsStore, IsOverloadNone, IsStreaming, IsInZA],
+                             MemEltTyDefault, i_prefix # "_horiz", ch>;
+
+  def NAME # _V : MInst<"svst1_ver_" # n_suffix, "vimP%", t,
+                        [IsStore, IsOverloadNone, IsStreaming, IsInZA],
+                        MemEltTyDefault, i_prefix # "_vert", ch>;
+
+  def NAME # _V_VNUM : MInst<"svst1_ver_vnum_" # n_suffix, "vimP%l", t,
+                             [IsStore, IsOverloadNone, IsStreaming, IsInZA],
+                             MemEltTyDefault, i_prefix # "_vert", ch>;
 }
 
 defm SVST1_ZA8 : ZAStore<"za8", "c", "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0_0>]>;
@@ -85,7 +79,6 @@ defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck
 defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>]>;
 defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>;
 
-let SMETargetGuard = "sme" in {
 def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "",
                           [IsOverloadNone, IsStreamingCompatible, IsInZA],
                           MemEltTyDefault, "aarch64_sme_str">;
@@ -93,21 +86,18 @@ def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "",
 def SVSTR_ZA : MInst<"svstr_za", "vm%", "",
                       [IsOverloadNone, IsStreamingCompatible, IsInZA],
                       MemEltTyDefault, "aarch64_sme_str", []>;
-}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Read horizontal/vertical ZA slices
 
 multiclass ZARead<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
-  let SMETargetGuard = "sme" in {
-    def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPim", t,
-                          MergeOp1, i_prefix # "_horiz",
-                          [IsReadZA, IsStreaming, IsInZA], ch>;
-
-    def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPim", t,
-                          MergeOp1, i_prefix # "_vert",
-                          [IsReadZA, IsStreaming, IsInZA], ch>;
-  }
+  def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPim", t,
+                        MergeOp1, i_prefix # "_horiz",
+                        [IsReadZA, IsStreaming, IsInZA], ch>;
+
+  def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPim", t,
+                        MergeOp1, i_prefix # "_vert",
+                        [IsReadZA, IsStreaming, IsInZA], ch>;
 }
 
 defm SVREAD_ZA8 : ZARead<"za8", "cUcm", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_0>]>;
@@ -120,15 +110,13 @@ defm SVREAD_ZA128 : ZARead<"za128", "csilUcUsUiUlmhbfd", "aarch64_sme_readq", [I
 // Write horizontal/vertical ZA slices
 
 multiclass ZAWrite<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
-  let SMETargetGuard = "sme" in {
-    def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimPd", t,
-                          MergeOp1, i_prefix # "_horiz",
-                          [IsWriteZA, IsStreaming, IsInOutZA], ch>;
-
-    def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimPd", t,
-                          MergeOp1, i_prefix # "_vert",
-                          [IsWriteZA, IsStreaming, IsInOutZA], ch>;
-  }
+  def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimPd", t,
+                        MergeOp1, i_prefix # "_horiz",
+                        [IsWriteZA, IsStreaming, IsInOutZA], ch>;
+
+  def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimPd", t,
+                        MergeOp1, i_prefix # "_vert",
+                        [IsWriteZA, IsStreaming, IsInOutZA], ch>;
 }
 
 defm SVWRITE_ZA8 : ZAWrite<"za8", "cUcm", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>]>;
@@ -140,13 +128,11 @@ defm SVWRITE_ZA128 : ZAWrite<"za128", "csilUcUsUiUlmhbfd", "aarch64_sme_writeq",
 ////////////////////////////////////////////////////////////////////////////////
 // SME - Zero
 
-let SMETargetGuard = "sme" in {
-  def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero",
-                             [IsOverloadNone, IsStreamingCompatible, IsInOutZA],
-                             [ImmCheck<0, ImmCheck0_255>]>;
-  def SVZERO_ZA      : SInst<"svzero_za", "vv", "", MergeNone, "aarch64_sme_zero",
-                             [IsOverloadNone, IsStreamingCompatible, IsOutZA]>;
-}
+def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero",
+                           [IsOverloadNone, IsStreamingCompatible, IsInOutZA],
+                           [ImmCheck<0, ImmCheck0_255>]>;
+def SVZERO_ZA      : SInst<"svzero_za", "vv", "", MergeNone, "aarch64_sme_zero",
+                           [IsOverloadNone, IsStreamingCompatible, IsOutZA]>;
 
 let SMETargetGuard = "sme2p1" in {
   def SVZERO_ZA64_VG1x2 : SInst<"svzero_za64_vg1x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x2",
@@ -171,11 +157,9 @@ let SMETargetGuard = "sme2p1" in {
 // SME - Counting elements in a streaming vector
 
 multiclass ZACount<string n_suffix> {
-  let SMETargetGuard = "sme" in {
-    def NAME : SInst<"sv" # n_suffix, "nv", "", MergeNone,
-                      "aarch64_sme_" # n_suffix,
-                      [IsOverloadNone, IsStreamingCompatible]>;
-  }
+  def NAME : SInst<"sv" # n_suffix, "nv", "", MergeNone,
+                    "aarch64_sme_" # n_suffix,
+                    [IsOverloadNone, IsStreamingCompatible]>;
 }
 
 defm SVCNTSB : ZACount<"cntsb">;
@@ -187,11 +171,9 @@ defm SVCNTSD : ZACount<"cntsd">;
 // SME - ADDHA/ADDVA
 
 multiclass ZAAdd<string n_suffix> {
-  let SMETargetGuard = "sme" in {
-    def NAME # _ZA32: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPd", "iUi", MergeOp1,
-                      "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA],
-                      [ImmCheck<0, ImmCheck0_3>]>;
-  }
+  def NAME # _ZA32: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPd", "iUi", MergeOp1,
+                    "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA],
+                    [ImmCheck<0, ImmCheck0_3>]>;
 
   let SMETargetGuard = "sme-i16i64" in {
     def NAME # _ZA64: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPd", "lUl", MergeOp1,
@@ -207,13 +189,11 @@ defm SVADDVA : ZAAdd<"addva">;
 // SME - SMOPA, SMOPS, UMOPA, UMOPS
 
 multiclass ZAIntOuterProd<string n_suffix1, string n_suffix2> {
-  let SMETargetGuard = "sme" in {
-    def NAME # _ZA32_B: SInst<"sv" # n_suffix2 # "_za32[_{d}]",
-                              "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "c",
-                              MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide",
-                              [IsStreaming, IsInOutZA],
-                              [ImmCheck<0, ImmCheck0_3>]>;
-  }
+  def NAME # _ZA32_B: SInst<"sv" # n_suffix2 # "_za32[_{d}]",
+                            "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "c",
+                            MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide",
+                            [IsStreaming, IsInOutZA],
+                            [ImmCheck<0, ImmCheck0_3>]>;
 
   let SMETargetGuard = "sme-i16i64" in {
     def NAME # _ZA64_H: SInst<"sv" # n_suffix2 # "_za64[_{d}]",
@@ -233,14 +213,12 @@ defm SVUMOPS : ZAIntOuterProd<"u", "mops">;
 // SME - SUMOPA, SUMOPS, USMOPA, USMOPS
 
 multiclass ZAIntOuterProdMixedSigns<string n_suffix1, string n_suffix2> {
-  let SMETargetGuard = "sme" in {
-    def NAME # _ZA32_B: SInst<"sv" # n_suffix1 # n_suffix2 # "_za32[_{d}]",
-                              "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"),
-                              !cond(!eq(n_suffix1, "su") : "", true: "U") # "c",
-                              MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide",
-                              [IsStreaming, IsInOutZA],
-                              [ImmCheck<0, ImmCheck0_3>]>;
-  }
+  def NAME # _ZA32_B: SInst<"sv" # n_suffix1 # n_suffix2 # "_za32[_{d}]",
+                            "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"),
+                            !cond(!eq(n_suffix1, "su") : "", true: "U") # "c",
+                            MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide",
+                            [IsStreaming, IsInOutZA],
+                            [ImmCheck<0, ImmCheck0_3>]>;
 
   let SMETargetGuard = "sme-i16i64" in {
     def NAME # _ZA64_H: SInst<"sv" # n_suffix1 # n_suffix2 # "_za64[_{d}]",
@@ -261,22 +239,20 @@ defm SVUSMOPS : ZAIntOuterProdMixedSigns<"us", "mops">;
 // SME - FMOPA, FMOPS
 
 multiclass ZAFPOuterProd<string n_suffix> {
-  let SMETargetGuard = "sme" in {
-    def NAME # _ZA32_B: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "h",
-                              MergeOp1, "aarch64_sme_" # n_suffix # "_wide",
-                              [IsStreaming, IsInOutZA],
-                              [ImmCheck<0, ImmCheck0_3>]>;
+  def NAME # _ZA32_B: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "h",
+                            MergeOp1, "aarch64_sme_" # n_suffix # "_wide",
+                            [IsStreaming, IsInOutZA],
+                            [ImmCheck<0, ImmCheck0_3>]>;
 
-    def NAME # _ZA32_H: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "b",
-                              MergeOp1, "aarch64_sme_" # n_suffix # "_wide",
-                              [IsStreaming, IsInOutZA],
-                              [ImmCheck<0, ImmCheck0_3>]>;
+  def NAME # _ZA32_H: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "b",
+                            MergeOp1, "aarch64_sme_" # n_suffix # "_wide",
+                            [IsStreaming, IsInOutZA],
+                            [ImmCheck<0, ImmCheck0_3>]>;
 
-    def NAME # _ZA32_S: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "f",
-                              MergeOp1, "aarch64_sme_" # n_suffix,
-                              [IsStreaming, IsInOutZA],
-                              [ImmCheck<0, ImmCheck0_3>]>;
-  }
+  def NAME # _ZA32_S: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "f",
+                            MergeOp1, "aarch64_sme_" # n_suffix,
+                            [IsStreaming, IsInOutZA],
+                            [ImmCheck<0, ImmCheck0_3>]>;
 
   let SMETargetGuard = "sme-f64f64" in {
     def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPdd", "d",
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 7513a3e..9ba07d8 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -36,7 +36,7 @@ def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl",            [IsLoa
 def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl",               [IsLoad, VerifyRuntimeMode],               MemEltTyInt32,   "aarch64_sve_ld1">;
 def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl",               [IsLoad, IsZExtReturn, VerifyRuntimeMode], MemEltTyInt32,   "aarch64_sve_ld1">;
 
-let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in {
+let SMETargetGuard = InvalidMode in {
 // Load one vector (vector base)
 def SVLD1_GATHER_BASES_U   : MInst<"svld1_gather[_{2}base]_{d}",   "dPu", "ilUiUlfd", [IsGatherLoad],               MemEltTyDefault, "aarch64_sve_ld1_gather_scalar_offset">;
 def SVLD1SB_GATHER_BASES_U : MInst<"svld1sb_gather[_{2}base]_{d}", "dPu", "ilUiUl",   [IsGatherLoad],               MemEltTyInt8,    "aarch64_sve_ld1_gather_scalar_offset">;
@@ -134,7 +134,7 @@ def SVLDFF1SW_VNUM : MInst<"svldff1sw_vnum_{d}", "dPUl", "lUl",               [I
 def SVLDFF1UW_VNUM : MInst<"svldff1uw_vnum_{d}", "dPYl", "lUl",               [IsLoad, IsZExtReturn], MemEltTyInt32,   "aarch64_sve_ldff1">;
 }
 
-let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in {
+let SMETargetGuard = InvalidMode in {
 // First-faulting load one vector (vector base)
 def SVLDFF1_GATHER_BASES_U   : MInst<"svldff1_gather[_{2}base]_{d}",   "dPu", "ilUiUlfd", [IsGatherLoad],               MemEltTyDefault, "aarch64_sve_ldff1_gather_scalar_offset">;
 def SVLDFF1SB_GATHER_BASES_U : MInst<"svldff1sb_gather[_{2}base]_{d}", "dPu", "ilUiUl",   [IsGatherLoad],               MemEltTyInt8,    "aarch64_sve_ldff1_gather_scalar_offset">;
@@ -251,15 +251,15 @@ def SVLD3_VNUM : SInst<"svld3_vnum[_{2}]", "3Pcl", "csilUcUsUiUlhfdbm", MergeNon
 def SVLD4_VNUM : SInst<"svld4_vnum[_{2}]", "4Pcl", "csilUcUsUiUlhfdbm", MergeNone, "aarch64_sve_ld4_sret", [IsStructLoad, VerifyRuntimeMode]>;
 
 // Load one octoword and replicate (scalar base)
-let SVETargetGuard = "sve,f64mm", SMETargetGuard = InvalidMode in {
+let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
   def SVLD1RO : SInst<"svld1ro[_{2}]", "dPc", "csilUcUsUiUlhfdbm", MergeNone, "aarch64_sve_ld1ro">;
 }
 
-let SVETargetGuard = "sve,bf16", SMETargetGuard = InvalidMode in {
+let SVETargetGuard = "bf16", SMETargetGuard = InvalidMode in {
   def SVBFMMLA       : SInst<"svbfmmla[_{0}]",       "MMdd",  "b", MergeNone, "aarch64_sve_bfmmla",       [IsOverloadNone]>;
 }
 
-let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in {
+let SVETargetGuard = "bf16", SMETargetGuard = "bf16" in {
   def SVBFDOT        : SInst<"svbfdot[_{0}]",        "MMdd",  "b", MergeNone, "aarch64_sve_bfdot",           [IsOverloadNone, VerifyRuntimeMode]>;
   def SVBFMLALB      : SInst<"svbfmlalb[_{0}]",      "MMdd",  "b", MergeNone, "aarch64_sve_bfmlalb",         [IsOverloadNone, VerifyRuntimeMode]>;
   def SVBFMLALT      : SInst<"svbfmlalt[_{0}]",      "MMdd",  "b", MergeNone, "aarch64_sve_bfmlalt",         [IsOverloadNone, VerifyRuntimeMode]>;
@@ -326,7 +326,7 @@ def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl",              [Is
 def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l",                 [IsStore, VerifyRuntimeMode], MemEltTyInt32,   "aarch64_sve_st1">;
 def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul",                [IsStore, VerifyRuntimeMode], MemEltTyInt32,   "aarch64_sve_st1">;
 
-let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in {
+let SMETargetGuard = InvalidMode in {
 // Store one vector (vector base)
 def SVST1_SCATTER_BASES_U     : MInst<"svst1_scatter[_{2}base_{d}]",  "vPud",  "ilUiUlfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1_scatter_scalar_offset">;
 def SVST1B_SCATTER_BASES_U    : MInst<"svst1b_scatter[_{2}base_{d}]", "vPud",  "ilUiUl",   [IsScatterStore], MemEltTyInt8,    "aarch64_sve_st1_scatter_scalar_offset">;
@@ -464,7 +464,7 @@ def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch, VerifyRuntimeM
 def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch, VerifyRuntimeMode], MemEltTyInt32, "aarch64_sve_prf">;
 def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch, VerifyRuntimeMode], MemEltTyInt64, "aarch64_sve_prf">;
 
-let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in {
+let SMETargetGuard = InvalidMode in {
 // Prefetch (Vector bases)
 def SVPRFB_GATHER_BASES : MInst<"svprfb_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt8,  "aarch64_sve_prfb_gather_scalar_offset">;
 def SVPRFH_GATHER_BASES : MInst<"svprfh_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt16, "aarch64_sve_prfh_gather_scalar_offset">;
@@ -502,7 +502,7 @@ def SVPRFD_GATHER_BASES_OFFSET : MInst<"svprfd_gather[_{2}base]_index",  "vPdlJ"
 ////////////////////////////////////////////////////////////////////////////////
 // Address calculations
 
-let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in {
+let SMETargetGuard = InvalidMode in {
 def SVADRB : SInst<"svadrb[_{0}base]_[{2}]offset", "uud", "ilUiUl", MergeNone, "aarch64_sve_adrb">;
 def SVADRH : SInst<"svadrh[_{0}base]_[{2}]index",  "uud", "ilUiUl", MergeNone, "aarch64_sve_adrh">;
 def SVADRW : SInst<"svadrw[_{0}base]_[{2}]index",  "uud", "ilUiUl", MergeNone, "aarch64_sve_adrw">;
@@ -778,11 +778,11 @@ defm SVRINTX : SInstZPZ<"svrintx", "hfd", "aarch64_sve_frintx">;
 defm SVRINTZ : SInstZPZ<"svrintz", "hfd", "aarch64_sve_frintz">;
 defm SVSQRT  : SInstZPZ<"svsqrt",  "hfd", "aarch64_sve_fsqrt">;
 
-let SVETargetGuard = "sve", SMETargetGuard = "sme2,ssve-fexpa" in {
+let SMETargetGuard = "sme2,ssve-fexpa" in {
 def SVEXPA  : SInst<"svexpa[_{d}]",  "du",   "hfd", MergeNone, "aarch64_sve_fexpa_x", [VerifyRuntimeMode]>;
 }
 
-let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in {
+let SMETargetGuard = InvalidMode in {
 def SVTMAD  : SInst<"svtmad[_{d}]",  "dddi", "hfd", MergeNone, "aarch64_sve_ftmad_x", [], [ImmCheck<2, ImmCheck0_7>]>;
 def SVTSMUL : SInst<"svtsmul[_{d}]", "ddu",  "hfd", MergeNone, "aarch64_sve_ftsmul_x">;
 def SVTSSEL : SInst<"svtssel[_{d}]", "ddu",  "hfd", MergeNone, "aarch64_sve_ftssel_x">;
@@ -825,7 +825,7 @@ def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frs
 ////////////////////////////////////////////////////////////////////////////////
 // Floating-point reductions
 
-let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in {
+let SMETargetGuard = InvalidMode in {
 def SVFADDA : SInst<"svadda[_{d}]",   "sPsd", "hfd", MergeNone, "aarch64_sve_fadda">;
 }
 
@@ -946,14 +946,14 @@ defm SVFCVT_F32_F64   : SInstCvtMXZ<"svcvt_f32[_f64]", "MMPd", "MPd", "d", "aarc
 defm SVFCVT_F64_F16   : SInstCvtMXZ<"svcvt_f64[_f16]", "ddPO", "dPO", "d", "aarch64_sve_fcvt_f64f16">;
 defm SVFCVT_F64_F32   : SInstCvtMXZ<"svcvt_f64[_f32]", "ddPM", "dPM", "d", "aarch64_sve_fcvt_f64f32">;
 
-let SVETargetGuard = "sve,bf16", SMETargetGuard = "sme,bf16" in {
+let SVETargetGuard = "bf16", SMETargetGuard = "bf16" in {
 defm SVCVT_BF16_F32    : SInstCvtMXZ<"svcvt_bf16[_f32]", "$$Pd", "$Pd", "f", "aarch64_sve_fcvt_bf16f32_v2">;
 
 def SVCVTNT_BF16_F32   : SInst<"svcvtnt_bf16[_f32]", "$$Pd", "f", MergeOp1, "aarch64_sve_fcvtnt_bf16f32_v2", [IsOverloadNone, VerifyRuntimeMode]>;
 //  SVCVTNT_X_BF16_F32 : Implemented as macro by SveEmitter.cpp
 }
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 defm SVCVTLT_F32_F16 : SInstCvtMX<"svcvtlt_f32[_f16]",  "ddPh", "dPh", "f", "aarch64_sve_fcvtlt_f32f16">;
 defm SVCVTLT_F64_F32 : SInstCvtMX<"svcvtlt_f64[_f32]",  "ddPh", "dPh", "d", "aarch64_sve_fcvtlt_f64f32">;
 
@@ -980,8 +980,8 @@ defm SVCLASTA_N : SVEPerm<"svclasta[_n_{d}]", "sPsd", "aarch64_sve_clasta_n">;
 defm SVCLASTB   : SVEPerm<"svclastb[_{d}]",   "dPdd", "aarch64_sve_clastb">;
 defm SVCLASTB_N : SVEPerm<"svclastb[_n_{d}]", "sPsd", "aarch64_sve_clastb_n">;
 
-let SVETargetGuard = "sve", SMETargetGuard = "sme2p2" in {
-def SVCOMPACT    : SInst<"svcompact[_{d}]",   "dPd",  "ilUiUlfd", MergeNone, "aarch64_sve_compact", [VerifyRuntimeMode]>;
+let SMETargetGuard = "sme2p2" in {
+def SVCOMPACT : SInst<"svcompact[_{d}]", "dPd",  "ilUiUlfd", MergeNone, "aarch64_sve_compact", [VerifyRuntimeMode]>;
 }
 
 // Note: svdup_lane is implemented using the intrinsic for TBL to represent a
@@ -1088,7 +1088,7 @@ def SVPTEST_LAST  : SInst<"svptest_last",  "sPP", "Pc", MergeNone, "aarch64_sve_
 ////////////////////////////////////////////////////////////////////////////////
 // FFR manipulation
 
-let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in {
+let SMETargetGuard = InvalidMode in {
 def SVRDFFR   : SInst<"svrdffr",   "Pv", "Pc", MergeNone, "", [IsOverloadNone]>;
 def SVRDFFR_Z : SInst<"svrdffr_z", "PP", "Pc", MergeNone, "", [IsOverloadNone]>;
 def SVSETFFR  : SInst<"svsetffr",  "vv", "",   MergeNone, "", [IsOverloadNone]>;
@@ -1173,13 +1173,13 @@ def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "
 def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32", [VerifyRuntimeMode]>;
 def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64", [VerifyRuntimeMode]>;
 
-let SVETargetGuard = "sve,i8mm", SMETargetGuard = InvalidMode in {
+let SVETargetGuard = "i8mm", SMETargetGuard = InvalidMode in {
 def SVMLLA_S32   : SInst<"svmmla[_s32]",   "ddqq","i",  MergeNone, "aarch64_sve_smmla">;
 def SVMLLA_U32   : SInst<"svmmla[_u32]",   "ddqq","Ui", MergeNone, "aarch64_sve_ummla">;
 def SVUSMLLA_S32 : SInst<"svusmmla[_s32]", "ddbq","i",  MergeNone, "aarch64_sve_usmmla">;
 }
 
-let SVETargetGuard = "sve,i8mm", SMETargetGuard = "sme,i8mm"in {
+let SVETargetGuard = "i8mm", SMETargetGuard = "i8mm"in {
 def SVUSDOT_S   : SInst<"svusdot[_s32]",   "ddbq", "i", MergeNone, "aarch64_sve_usdot", [VerifyRuntimeMode]>;
 def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot", [VerifyRuntimeMode]>;
 def SVSUDOT_S   : SInst<"svsudot[_s32]",   "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, VerifyRuntimeMode]>;
@@ -1189,11 +1189,11 @@ def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi",  "i", MergeNone, "aarc
 def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi",  "i", MergeNone, "aarch64_sve_sudot_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>;
 }
 
-let SVETargetGuard = "sve,f32mm", SMETargetGuard = InvalidMode in {
+let SVETargetGuard = "f32mm", SMETargetGuard = InvalidMode in {
 def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla">;
 }
 
-let SVETargetGuard = "sve,f64mm", SMETargetGuard = InvalidMode in {
+let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
 def SVMLLA_F64 : SInst<"svmmla[_f64]",  "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
 
 def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
@@ -1243,7 +1243,7 @@ let SVETargetGuard = "sve2p1|sme2", SMETargetGuard = "sve2p1|sme2" in {
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 WhileGE/GT
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhileOrMultiVecCvt, VerifyRuntimeMode]>;
 def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhileOrMultiVecCvt, VerifyRuntimeMode]>;
 def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhileOrMultiVecCvt, VerifyRuntimeMode]>;
@@ -1268,7 +1268,7 @@ let SVETargetGuard = "sve2p1|sme2", SMETargetGuard = "sve2p1|sme2" in {
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Uniform DSP operations
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 defm SVQADD_S  : SInstZPZZ<"svqadd",  "csli",     "aarch64_sve_sqadd",  "aarch64_sve_sqadd">;
 defm SVQADD_U  : SInstZPZZ<"svqadd",  "UcUsUiUl", "aarch64_sve_uqadd",  "aarch64_sve_uqadd">;
 defm SVHADD_S  : SInstZPZZ<"svhadd",  "csli",     "aarch64_sve_shadd",  "aarch64_sve_shadd">;
@@ -1303,7 +1303,7 @@ multiclass SInstZPZxZ<string name, string types, string pat_v, string pat_n, str
   def _N_Z : SInst<name # "[_n_{d}]", pat_n, types, MergeZero, intrinsic, flags>;
 }
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil",     "dPdx", "dPdK", "aarch64_sve_sqrshl", [VerifyRuntimeMode]>;
 defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl", [VerifyRuntimeMode]>;
 defm SVQSHL_S  : SInstZPZxZ<"svqshl",  "csil",     "dPdx", "dPdK", "aarch64_sve_sqshl", [VerifyRuntimeMode]>;
@@ -1357,7 +1357,7 @@ multiclass SInstPairwise<string name, string types, string intrinsic, list<FlagT
   def _X   : SInst<name # "[_{d}]", "dPdd", types, MergeAny, intrinsic, flags>;
 }
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 defm SVADDP   : SInstPairwise<"svaddp",   "csliUcUsUiUl", "aarch64_sve_addp", [VerifyRuntimeMode]>;
 defm SVADDP_F : SInstPairwise<"svaddp",   "hfd",          "aarch64_sve_faddp", [VerifyRuntimeMode]>;
 defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd",          "aarch64_sve_fmaxnmp", [VerifyRuntimeMode]>;
@@ -1373,7 +1373,7 @@ defm SVMINP_U : SInstPairwise<"svminp",   "UcUsUiUl",     "aarch64_sve_uminp", [
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Widening pairwise arithmetic
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1,  "aarch64_sve_sadalp", [VerifyRuntimeMode]>;
 def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny,  "aarch64_sve_sadalp", [VerifyRuntimeMode]>;
 def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp", [VerifyRuntimeMode]>;
@@ -1387,7 +1387,7 @@ def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_s
 // SVE2 - Bitwise ternary logical instructions
 //
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVBCAX  : SInst<"svbcax[_{d}]",  "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [VerifyRuntimeMode]>;
 def SVBSL   : SInst<"svbsl[_{d}]",   "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [VerifyRuntimeMode]>;
 def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [VerifyRuntimeMode]>;
@@ -1407,7 +1407,7 @@ def SVXAR_N   : SInst<"svxar[_n_{d}]",   "dddi", "csilUcUsUiUl", MergeNone, "aar
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Large integer arithmetic
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb", [VerifyRuntimeMode]>;
 def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt", [VerifyRuntimeMode]>;
 def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb", [VerifyRuntimeMode]>;
@@ -1422,7 +1422,7 @@ def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Multiplication by indexed elements
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
 def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
 def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi",  "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
@@ -1430,7 +1430,7 @@ def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi",  "silUsUiUl", MergeNone, "a
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Uniform complex integer arithmetic
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVCADD             : SInst<"svcadd[_{d}]",          "dddi",   "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x",           [VerifyRuntimeMode], [ImmCheck<2, ImmCheckComplexRot90_270>]>;
 def SVSQCADD           : SInst<"svqcadd[_{d}]",         "dddi",   "csil",         MergeNone, "aarch64_sve_sqcadd_x",         [VerifyRuntimeMode], [ImmCheck<2, ImmCheckComplexRot90_270>]>;
 def SVCMLA             : SInst<"svcmla[_{d}]",          "ddddi",  "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x",           [VerifyRuntimeMode], [ImmCheck<3, ImmCheckComplexRotAll90>]>;
@@ -1457,7 +1457,7 @@ multiclass SInstWideDSPWide<string name, string types, string intrinsic> {
   def _N : SInst<name # "[_n_{d}]", "ddR", types, MergeNone, intrinsic, [VerifyRuntimeMode]>;
 }
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 defm SVABALB_S : SInstWideDSPAcc<"svabalb",   "sil",    "aarch64_sve_sabalb">;
 defm SVABALB_U : SInstWideDSPAcc<"svabalb",   "UsUiUl", "aarch64_sve_uabalb">;
 defm SVABALT_S : SInstWideDSPAcc<"svabalt",   "sil",    "aarch64_sve_sabalt">;
@@ -1536,7 +1536,7 @@ def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi",  "il",   MergeNone, "
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Narrowing DSP operations
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVADDHNB  : SInst<"svaddhnb[_{d}]",     "hdd",  "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [VerifyRuntimeMode]>;
 def SVADDHNT  : SInst<"svaddhnt[_{d}]",     "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [VerifyRuntimeMode]>;
 def SVRADDHNB : SInst<"svraddhnb[_{d}]",    "hdd",  "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [VerifyRuntimeMode]>;
@@ -1576,7 +1576,7 @@ def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]",  "hhdi", "UsUiUl",    MergeNone, "a
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Unary narrowing operations
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVQXTNB_S  : SInst<"svqxtnb[_{d}]",  "hd",  "sil",    MergeNone, "aarch64_sve_sqxtnb", [VerifyRuntimeMode]>;
 def SVQXTNB_U  : SInst<"svqxtnb[_{d}]",  "hd",  "UsUiUl", MergeNone, "aarch64_sve_uqxtnb", [VerifyRuntimeMode]>;
 def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed",  "sil",    MergeNone, "aarch64_sve_sqxtunb", [VerifyRuntimeMode]>;
@@ -1589,7 +1589,7 @@ def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil",    MergeNone, "aarch64_sv
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Widening complex integer arithmetic
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 defm SVADDLBT : SInstWideDSPLong<"svaddlbt", "sil", "aarch64_sve_saddlbt">;
 defm SVSUBLBT : SInstWideDSPLong<"svsublbt", "sil", "aarch64_sve_ssublbt">;
 defm SVSUBLTB : SInstWideDSPLong<"svsubltb", "sil", "aarch64_sve_ssubltb">;
@@ -1723,7 +1723,7 @@ def SVSTNT1W_SCATTER_INDEX_S : MInst<"svstnt1w_scatter[_{2}base]_index[_{d}]", "
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Polynomial arithmetic
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVEORBT         : SInst<"sveorbt[_{d}]",         "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [VerifyRuntimeMode]>;
 def SVEORBT_N       : SInst<"sveorbt[_n_{d}]",       "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [VerifyRuntimeMode]>;
 def SVEORTB         : SInst<"sveortb[_{d}]",         "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [VerifyRuntimeMode]>;
@@ -1744,7 +1744,7 @@ def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda",  "UcUi", MergeNone,
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Complex integer dot product
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVCDOT      : SInst<"svcdot[_{d}]",      "ddqqi",  "il",   MergeNone, "aarch64_sve_cdot",      [VerifyRuntimeMode], [ImmCheck<3, ImmCheckComplexRotAll90>]>;
 def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il",   MergeNone, "aarch64_sve_cdot_lane", [VerifyRuntimeMode], [ImmCheck<4, ImmCheckComplexRotAll90>, ImmCheck<3, ImmCheckLaneIndexDot, 2>]>;
 }
@@ -1752,7 +1752,7 @@ def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il",   MergeNone, "aarch
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Floating-point widening multiply-accumulate
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVMLALB_F      : SInst<"svmlalb[_{d}]",      "ddhh",  "f", MergeNone, "aarch64_sve_fmlalb", [VerifyRuntimeMode]>;
 def SVMLALB_F_N    : SInst<"svmlalb[_n_{d}]",    "ddhR",  "f", MergeNone, "aarch64_sve_fmlalb", [VerifyRuntimeMode]>;
 def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
@@ -1770,7 +1770,7 @@ def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Floating-point integer binary logarithm
 
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1,     "aarch64_sve_flogb", [VerifyRuntimeMode]>;
 def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd",  "hfd", MergeAnyExp,  "aarch64_sve_flogb", [VerifyRuntimeMode]>;
 def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd",  "hfd", MergeZeroExp, "aarch64_sve_flogb", [VerifyRuntimeMode]>;
@@ -1794,7 +1794,7 @@ def SVNMATCH : SInst<"svnmatch[_{d}]", "PPdd", "csUcUs", MergeNone, "aarch64_sve
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Contiguous conflict detection
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc",  MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW, VerifyRuntimeMode]>;
 def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUshb", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, VerifyRuntimeMode]>;
 def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW, VerifyRuntimeMode]>;
@@ -1808,7 +1808,7 @@ def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sv
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Extended table lookup/permute
-let SVETargetGuard = "sve2", SMETargetGuard = "sme" in {
+let SVETargetGuard = "sve2" in {
 def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u",  "csilUcUsUiUlhfdb", MergeNone, "", [VerifyRuntimeMode]>;
 def SVTBX  : SInst<"svtbx[_{d}]",  "dddu", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>;
 }
@@ -1850,7 +1850,7 @@ def SVSM4E    : SInst<"svsm4e[_{d}]",    "ddd", "Ui", MergeNone, "aarch64_sve_sm
 def SVSM4EKEY : SInst<"svsm4ekey[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm4ekey", [IsOverloadNone]>;
 }
 
-let SVETargetGuard = "sve2,sve-bitperm", SMETargetGuard = "sme,ssve-bitperm" in {
+let SVETargetGuard = "sve2,sve-bitperm", SMETargetGuard = "ssve-bitperm" in {
 def SVBDEP   : SInst<"svbdep[_{d}]",   "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x", [VerifyRuntimeMode]>;
 def SVBDEP_N : SInst<"svbdep[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x", [VerifyRuntimeMode]>;
 def SVBEXT   : SInst<"svbext[_{d}]",   "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bext_x", [VerifyRuntimeMode]>;
@@ -1859,7 +1859,7 @@ def SVBGRP   : SInst<"svbgrp[_{d}]",   "ddd", "UcUsUiUl", MergeNone, "aarch64_sv
 def SVBGRP_N : SInst<"svbgrp[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bgrp_x", [VerifyRuntimeMode]>;
 }
 
-let SVETargetGuard = "sve2p1|sme", SMETargetGuard = "sve2p1|sme" in {
+let SVETargetGuard = "sve2p1|sme" in {
 def SVPSEL_B : SInst<"svpsel_lane_b8",  "PPPm", "Pc", MergeNone, "", [VerifyRuntimeMode], []>;
 def SVPSEL_H : SInst<"svpsel_lane_b16", "PPPm", "Ps", MergeNone, "", [VerifyRuntimeMode], []>;
 def SVPSEL_S : SInst<"svpsel_lane_b32", "PPPm", "Pi", MergeNone, "", [VerifyRuntimeMode], []>;
@@ -1965,7 +1965,7 @@ def SVDOT_LANE_X2_F : SInst<"svdot_lane[_{d}_{2}]", "ddhhi", "f",  MergeNone, "a
 def SVFCLAMP : SInst<"svclamp[_{d}]", "dddd", "hfd", MergeNone, "aarch64_sve_fclamp", [VerifyRuntimeMode], []>;
 }
 
-let SVETargetGuard = "sve2p1|sme", SMETargetGuard = "sve2p1|sme" in {
+let SVETargetGuard = "sve2p1|sme" in {
 def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil",     MergeNone, "aarch64_sve_sclamp", [VerifyRuntimeMode], []>;
 def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [VerifyRuntimeMode], []>;
 
@@ -2340,7 +2340,7 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in {
   def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode]>;
 }
 
-let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in {
+let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="ssve-fp8dot2" in {
   // 8-bit floating-point dot product to half-precision (vectors)
   def SVFDOT_2WAY   :  SInst<"svdot[_f16_mf8]",   "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode]>;
   def SVFDOT_N_2WAY :  SInst<"svdot[_n_f16_mf8]", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode]>;
@@ -2349,7 +2349,7 @@ let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in {
   def SVFDOT_LANE_2WAY :  SInst<"svdot_lane[_f16_mf8]", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>;
 }
 
-let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
+let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="ssve-fp8dot4" in {
   // 8-bit floating-point dot product to single-precision (vectors)
   def SVFDOT_4WAY   : SInst<"svdot[_f32_mf8]",   "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode]>;
   def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode]>;
@@ -2358,7 +2358,7 @@ let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
   def SVFDOT_LANE_4WAY :  SInst<"svdot_lane[_f32_mf8]", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_3>]>;
 }
 
-let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "sme,ssve-fp8fma" in {
+let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "ssve-fp8fma" in {
   // 8-bit floating-point multiply-add long to half-precision (bottom)
   def SVFMLALB   : SInst<"svmlalb[_f16_mf8]",   "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode]>;
   def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode]>;
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 51cef23..b64fd27 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1061,6 +1061,21 @@ def CIR_BrOp : CIR_Op<"br",[
 }
 
 //===----------------------------------------------------------------------===//
+// LabelOp
+//===----------------------------------------------------------------------===//
+
+// LabelOp carries the AlwaysSpeculatable trait so that it is not swept
+// away by the canonicalizer.
+def CIR_LabelOp : CIR_Op<"label", [AlwaysSpeculatable]> {
+  let description = [{
+    An identifier which may be referred by cir.goto operation
+  }];
+  let arguments = (ins StrAttr:$label);
+  let assemblyFormat = [{ $label attr-dict }];
+  let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
 // UnaryOp
 //===----------------------------------------------------------------------===//
 
@@ -3295,4 +3310,28 @@ def CIR_ExpectOp : CIR_Op<"expect", [
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// Floating Point Ops
+//===----------------------------------------------------------------------===//
+
+class CIR_UnaryFPToFPBuiltinOp<string mnemonic, string llvmOpName>
+    : CIR_Op<mnemonic, [Pure, SameOperandsAndResultType]>
+{
+  let arguments = (ins CIR_AnyFloatOrVecOfFloatType:$src);
+  let results = (outs CIR_AnyFloatOrVecOfFloatType:$result);
+
+  let assemblyFormat = "$src `:` type($src) attr-dict";
+
+  let llvmOp = llvmOpName;
+}
+
+def CIR_FAbsOp : CIR_UnaryFPToFPBuiltinOp<"fabs", "FAbsOp"> {
+  let summary = "Computes the floating-point absolute value";
+  let description = [{
+    `cir.fabs` computes the absolute value of a floating-point operand
+    and returns a result of the same type, ignoring floating-point
+    exceptions. It does not set `errno`.
+  }];
+}
+
 #endif // CLANG_CIR_DIALECT_IR_CIROPS_TD
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 6aab43c..7712a49 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3731,14 +3731,20 @@ def fopenmp_relocatable_target : Flag<["-"], "fopenmp-relocatable-target">,
 def fnoopenmp_relocatable_target : Flag<["-"], "fnoopenmp-relocatable-target">,
   Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>,
   Visibility<[ClangOption, CC1Option]>;
-def fopenmp_simd : Flag<["-"], "fopenmp-simd">, Group<f_Group>,
-  Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
-  HelpText<"Emit OpenMP code only for SIMD-based constructs.">;
+def fopenmp_simd : Flag<["-"], "fopenmp-simd">,
+                   Group<f_Group>,
+                   Flags<[NoArgumentUnused]>,
+                   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
+                   HelpText<"Emit OpenMP code only for SIMD-based constructs.">;
 def fopenmp_enable_irbuilder : Flag<["-"], "fopenmp-enable-irbuilder">, Group<f_Group>,
   Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Use the experimental OpenMP-IR-Builder codegen path.">;
-def fno_openmp_simd : Flag<["-"], "fno-openmp-simd">, Group<f_Group>,
-  Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>;
+def fno_openmp_simd
+    : Flag<["-"], "fno-openmp-simd">,
+      Group<f_Group>,
+      Flags<[NoArgumentUnused]>,
+      Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
+      HelpText<"Do not emit code for any OpenMP constructs.">;
 def fopenmp_cuda_mode : Flag<["-"], "fopenmp-cuda-mode">, Group<f_Group>,
   Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>;
 def fno_openmp_cuda_mode : Flag<["-"], "fno-openmp-cuda-mode">, Group<f_Group>,
@@ -5589,7 +5595,8 @@ def mno_outline_atomics : Flag<["-"], "mno-outline-atomics">, Group<f_clang_Grou
   HelpText<"Don't generate local calls to out-of-line atomic operations">;
 def mno_implicit_float : Flag<["-"], "mno-implicit-float">, Group<m_Group>,
   HelpText<"Don't generate implicit floating point or vector instructions">;
-def mimplicit_float : Flag<["-"], "mimplicit-float">, Group<m_Group>;
+def mimplicit_float : Flag<["-"], "mimplicit-float">, Group<m_Group>,
+  HelpText<"Generate implicit floating point or vector instructions">;
 def mrecip : Flag<["-"], "mrecip">, Group<m_Group>,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
   HelpText<"Equivalent to '-mrecip=all'">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 1dfc276..f933803 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2659,9 +2659,9 @@ public:
   /// identifies the magic value.
   typedef std::pair<const IdentifierInfo *, uint64_t> TypeTagMagicValue;
 
-  /// Diagnoses the current set of gathered accesses. This typically
-  /// happens at full expression level. The set is cleared after emitting the
-  /// diagnostics.
+  /// Diagnoses the current set of gathered accesses. This happens at the end of
+  /// each expression evaluation context. Diagnostics are emitted only for
+  /// accesses gathered in the current evaluation context.
   void DiagnoseMisalignedMembers();
 
   /// This function checks if the expression is in the sef of potentially
@@ -3117,9 +3117,6 @@ private:
 
     bool operator==(const MisalignedMember &m) { return this->E == m.E; }
   };
-  /// Small set of gathered accesses to potentially misaligned members
-  /// due to the packed attribute.
-  SmallVector<MisalignedMember, 4> MisalignedMembers;
 
   /// Adds an expression to the set of gathered misaligned members.
   void AddPotentialMisalignedMembers(Expr *E, RecordDecl *RD, ValueDecl *MD,
@@ -6765,6 +6762,10 @@ public:
     /// InLifetimeExtendingContext is true.
     SmallVector<MaterializeTemporaryExpr *, 8> ForRangeLifetimeExtendTemps;
 
+    /// Small set of gathered accesses to potentially misaligned members
+    /// due to the packed attribute.
+    SmallVector<MisalignedMember, 4> MisalignedMembers;
+
     /// \brief Describes whether we are in an expression constext which we have
     /// to handle differently.
     enum ExpressionKind {
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 234fa2c..9ecc7b6 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -473,9 +473,7 @@ bool Descriptor::hasTrivialDtor() const {
 bool Descriptor::isUnion() const { return isRecord() && ElemRecord->isUnion(); }
 
 InitMap::InitMap(unsigned N)
-    : UninitFields(N), Data(std::make_unique<T[]>(numFields(N))) {
-  std::fill_n(data(), numFields(N), 0);
-}
+    : UninitFields(N), Data(std::make_unique<T[]>(numFields(N))) {}
 
 bool InitMap::initializeElement(unsigned I) {
   unsigned Bucket = I / PER_FIELD;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index ee2d532..b602b97 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -598,6 +598,17 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static inline Floating abs(InterpState &S, const Floating &In) {
+  if (!In.isNegative())
+    return In;
+  // Negative: flip the sign on a copy held in a freshly allocated Floating.
+  Floating Output = S.allocFloat(In.getSemantics());
+  APFloat New = In.getAPFloat();
+  New.changeSign();
+  Output.copy(New);
+  return Output;
+}
+
 // The C standard says "fabs raises no floating-point exceptions,
 // even if x is a signaling NaN. The returned value is independent of
 // the current rounding direction mode."  Therefore constant folding can
@@ -606,16 +617,7 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame) {
   const Floating &Val = S.Stk.pop<Floating>();
-  APFloat F = Val.getAPFloat();
-  if (!F.isNegative()) {
-    S.Stk.push<Floating>(Val);
-    return true;
-  }
-
-  Floating Result = S.allocFloat(Val.getSemantics());
-  F.changeSign();
-  Result.copy(F);
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(abs(S, Val));
   return true;
 }
 
@@ -1686,6 +1688,57 @@ static bool interp__builtin_vector_reduce(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp__builtin_elementwise_abs(InterpState &S, CodePtr OpPC,
+                                            const InterpFrame *Frame,
+                                            const CallExpr *Call,
+                                            unsigned BuiltinID) {
+  assert(Call->getNumArgs() == 1);
+  QualType Ty = Call->getArg(0)->getType();
+  if (Ty->isIntegerType()) {
+    PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType());
+    APSInt Val = popToAPSInt(S.Stk, ArgT);
+    // Scalar integer: push Val.abs() using the call's result type.
+    pushInteger(S, Val.abs(), Call->getType());
+    return true;
+  }
+
+  if (Ty->isFloatingType()) {
+    Floating Val = S.Stk.pop<Floating>();
+    Floating Result = abs(S, Val);
+    S.Stk.push<Floating>(Result);
+    return true;
+  }
+
+  // Otherwise, the argument must be a vector.
+  assert(Call->getArg(0)->getType()->isVectorType());
+  const Pointer &Arg = S.Stk.pop<Pointer>();
+  assert(Arg.getFieldDesc()->isPrimitiveArray());
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+  assert(Dst.getFieldDesc()->isPrimitiveArray());
+  assert(Arg.getFieldDesc()->getNumElems() ==
+         Dst.getFieldDesc()->getNumElems());
+
+  QualType ElemType = Arg.getFieldDesc()->getElemQualType();
+  PrimType ElemT = *S.getContext().classify(ElemType);
+  unsigned NumElems = Arg.getNumElems();
+  // The elements are either all integers or all floating-point values.
+  for (unsigned I = 0; I != NumElems; ++I) {
+    if (ElemType->isIntegerType()) {
+      INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+        Dst.elem<T>(I) = T::from(static_cast<T>(
+            APSInt(Arg.elem<T>(I).toAPSInt().abs(),
+                   ElemType->isUnsignedIntegerOrEnumerationType())));
+      });
+    } else {
+      Floating Val = Arg.elem<Floating>(I);
+      Dst.elem<Floating>(I) = abs(S, Val);
+    }
+  }
+  Dst.initializeAllElements();
+
+  return true;
+}
+
 /// Can be called with an integer or vector as the first and only parameter.
 static bool interp__builtin_elementwise_popcount(InterpState &S, CodePtr OpPC,
                                                  const InterpFrame *Frame,
@@ -2774,6 +2827,9 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
     return interp__builtin_elementwise_popcount(S, OpPC, Frame, Call,
                                                 BuiltinID);
 
+  case Builtin::BI__builtin_elementwise_abs:
+    return interp__builtin_elementwise_abs(S, OpPC, Frame, Call, BuiltinID);
+
   case Builtin::BI__builtin_memcpy:
   case Builtin::BImemcpy:
   case Builtin::BI__builtin_wmemcpy:
diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp
index e19c232..a0903d0 100644
--- a/clang/lib/AST/CommentLexer.cpp
+++ b/clang/lib/AST/CommentLexer.cpp
@@ -214,7 +214,7 @@ bool isCommandNameStartCharacter(char C) {
 }
 
 bool isCommandNameCharacter(char C) {
-  return isAlphanumeric(C);
+  return isAsciiIdentifierContinue(C, false);
 }
 
 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 36dd0f5..7d45422 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11639,6 +11639,29 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
+  case Builtin::BI__builtin_elementwise_abs: {
+    APValue Source;
+    if (!EvaluateAsRValue(Info, E->getArg(0), Source))
+      return false;
+
+    QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+    unsigned SourceLen = Source.getVectorLength();
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(SourceLen);
+
+    for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
+      APValue CurrentEle = Source.getVectorElt(EltNum);
+      APValue Val = DestEltTy->isFloatingType()
+                        ? APValue(llvm::abs(CurrentEle.getFloat()))
+                        : APValue(APSInt(
+                              CurrentEle.getInt().abs(),
+                              DestEltTy->isUnsignedIntegerOrEnumerationType()));
+      ResultElements.push_back(Val);
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+
   case Builtin::BI__builtin_elementwise_add_sat:
   case Builtin::BI__builtin_elementwise_sub_sat:
   case clang::X86::BI__builtin_ia32_pmulhuw128:
@@ -13387,6 +13410,14 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
     return Success(Operand, E);
   }
 
+  case Builtin::BI__builtin_elementwise_abs: {
+    APSInt Val;
+    if (!EvaluateInteger(E->getArg(0), Val, Info))
+      return false;
+
+    return Success(Val.abs(), E);
+  }
+
   case Builtin::BI__builtin_expect:
   case Builtin::BI__builtin_expect_with_probability:
     return Visit(E->getArg(0));
@@ -15878,6 +15909,7 @@ bool FloatExprEvaluator::VisitCallExpr(const CallExpr *E) {
       return Error(E);
     return true;
 
+  case Builtin::BI__builtin_elementwise_abs:
   case Builtin::BI__builtin_fabs:
   case Builtin::BI__builtin_fabsf:
   case Builtin::BI__builtin_fabsl:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 16fc650..36aea4c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -72,6 +72,19 @@ RValue CIRGenFunction::emitRotate(const CallExpr *e, bool isRotateLeft) {
   return RValue::get(r);
 }
 
+template <class Operation>
+static RValue emitUnaryMaybeConstrainedFPBuiltin(CIRGenFunction &cgf,
+                                                 const CallExpr &e) {
+  mlir::Value arg = cgf.emitScalarExpr(e.getArg(0));
+  // Constrained-FP support is tracked as missing by the asserts below.
+  assert(!cir::MissingFeatures::cgFPOptionsRAII());
+  assert(!cir::MissingFeatures::fpConstraints());
+
+  auto call =
+      Operation::create(cgf.getBuilder(), arg.getLoc(), arg.getType(), arg);
+  return RValue::get(call->getResult(0));
+}
+
 RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
                                        const CallExpr *e,
                                        ReturnValueSlot returnValue) {
@@ -112,6 +125,16 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
   default:
     break;
 
+  case Builtin::BIfabs:
+  case Builtin::BIfabsf:
+  case Builtin::BIfabsl:
+  case Builtin::BI__builtin_fabs:
+  case Builtin::BI__builtin_fabsf:
+  case Builtin::BI__builtin_fabsf16:
+  case Builtin::BI__builtin_fabsl:
+  case Builtin::BI__builtin_fabsf128:
+    return emitUnaryMaybeConstrainedFPBuiltin<cir::FAbsOp>(*this, *e);
+
   case Builtin::BI__assume:
   case Builtin::BI__builtin_assume: {
     if (e->getArg(0)->HasSideEffects(getContext()))
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index 1fda848..6b6ac701 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -124,8 +124,8 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
   const QualType elementType =
       cgf.getContext().getAsArrayType(arrayQTy)->getElementType();
 
-  if (elementType.isDestructedType()) {
-    cgf.cgm.errorNYI(loc, "dtorKind NYI");
+  if (elementType.isDestructedType() && cgf.cgm.getLangOpts().Exceptions) {
+    cgf.cgm.errorNYI(loc, "initialized array requires destruction");
     return;
   }
 
@@ -135,9 +135,9 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
   const cir::PointerType cirElementPtrType =
       builder.getPointerTo(cirElementType);
 
-  auto begin = builder.create<cir::CastOp>(loc, cirElementPtrType,
-                                           cir::CastKind::array_to_ptrdecay,
-                                           destPtr.getPointer());
+  auto begin = cir::CastOp::create(builder, loc, cirElementPtrType,
+                                   cir::CastKind::array_to_ptrdecay,
+                                   destPtr.getPointer());
 
   const CharUnits elementSize =
       cgf.getContext().getTypeSizeInChars(elementType);
@@ -182,8 +182,8 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
     // Advance to the start of the rest of the array.
     if (numInitElements) {
       one = builder.getConstantInt(loc, cgf.PtrDiffTy, 1);
-      element = builder.create<cir::PtrStrideOp>(loc, cirElementPtrType,
-                                                 element, one);
+      element = cir::PtrStrideOp::create(builder, loc, cirElementPtrType,
+                                         element, one);
     }
 
     // Allocate the temporary variable
@@ -193,25 +193,52 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
     LValue tmpLV = cgf.makeAddrLValue(tmpAddr, elementPtrType);
     cgf.emitStoreThroughLValue(RValue::get(element), tmpLV);
 
-    // TODO(CIR): Replace this part later with cir::DoWhileOp
-    for (unsigned i = numInitElements; i != numArrayElements; ++i) {
-      cir::LoadOp currentElement = builder.createLoad(loc, tmpAddr);
-
-      // Emit the actual filler expression.
-      const LValue elementLV = cgf.makeAddrLValue(
-          Address(currentElement, cirElementType, elementAlign), elementType);
-
-      if (arrayFiller)
-        emitInitializationToLValue(arrayFiller, elementLV);
-      else
-        emitNullInitializationToLValue(loc, elementLV);
-
-      // Advance pointer and store them to temporary variable
-      one = builder.getConstantInt(loc, cgf.PtrDiffTy, 1);
-      cir::PtrStrideOp nextElement =
-          builder.createPtrStride(loc, currentElement, one);
-      cgf.emitStoreThroughLValue(RValue::get(nextElement), tmpLV);
-    }
+    // Compute the end of the array.
+    cir::ConstantOp numArrayElementsConst = builder.getConstInt(
+        loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), numArrayElements);
+    mlir::Value end = cir::PtrStrideOp::create(builder, loc, cirElementPtrType,
+                                               begin, numArrayElementsConst);
+
+    builder.createDoWhile(
+        loc,
+        /*condBuilder=*/
+        [&](mlir::OpBuilder &b, mlir::Location loc) {
+          cir::LoadOp currentElement = builder.createLoad(loc, tmpAddr);
+          mlir::Type boolTy = cgf.convertType(cgf.getContext().BoolTy);
+          cir::CmpOp cmp = cir::CmpOp::create(
+              builder, loc, boolTy, cir::CmpOpKind::ne, currentElement, end);
+          builder.createCondition(cmp);
+        },
+        /*bodyBuilder=*/
+        [&](mlir::OpBuilder &b, mlir::Location loc) {
+          cir::LoadOp currentElement = builder.createLoad(loc, tmpAddr);
+
+          assert(!cir::MissingFeatures::requiresCleanups());
+
+          // Emit the actual filler expression.
+          LValue elementLV = cgf.makeAddrLValue(
+              Address(currentElement, cirElementType, elementAlign),
+              elementType);
+          if (arrayFiller)
+            emitInitializationToLValue(arrayFiller, elementLV);
+          else
+            emitNullInitializationToLValue(loc, elementLV);
+
+          // Tell the EH cleanup that we finished with the last element.
+          if (cgf.cgm.getLangOpts().Exceptions) {
+            cgf.cgm.errorNYI(loc, "update destructed array element for EH");
+            return;
+          }
+
+          // Advance the pointer and store it to the temporary variable.
+          cir::ConstantOp one = builder.getConstInt(
+              loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), 1);
+          auto nextElement = cir::PtrStrideOp::create(
+              builder, loc, cirElementPtrType, currentElement, one);
+          cgf.emitStoreThroughLValue(RValue::get(nextElement), tmpLV);
+
+          builder.createYield(loc);
+        });
   }
 }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 2333ec3..ddc1edd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1181,6 +1181,9 @@ public:
 
   mlir::Value emitOpOnBoolExpr(mlir::Location loc, const clang::Expr *cond);
 
+  mlir::LogicalResult emitLabel(const clang::LabelDecl &d);
+  mlir::LogicalResult emitLabelStmt(const clang::LabelStmt &s);
+
   mlir::LogicalResult emitIfStmt(const clang::IfStmt &s);
 
   /// Emit code to compute the specified expression,
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 45dfcf5..d529688 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -438,6 +438,20 @@ void CIRGenModule::emitGlobalFunctionDefinition(clang::GlobalDecl gd,
     errorNYI(funcDecl->getSourceRange(), "deferredAnnotations");
 }
 
+void CIRGenModule::handleCXXStaticMemberVarInstantiation(VarDecl *vd) {
+  VarDecl::DefinitionKind dk = vd->isThisDeclarationADefinition();
+  if (dk == VarDecl::Definition && vd->hasAttr<DLLImportAttr>())
+    return;
+
+  TemplateSpecializationKind tsk = vd->getTemplateSpecializationKind();
+  // If we have a definition, this might be a deferred decl. If the
+  // instantiation is explicit, make sure we emit it at the end.
+  if (vd->getDefinition() && tsk == TSK_ExplicitInstantiationDefinition)
+    getAddrOfGlobalVar(vd);
+
+  emitTopLevelDecl(vd);
+}
+
 mlir::Operation *CIRGenModule::getGlobalValue(StringRef name) {
   return mlir::SymbolTable::lookupSymbolIn(theModule, name);
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 5538aba..283b76a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -120,6 +120,9 @@ public:
 
   mlir::Operation *lastGlobalOp = nullptr;
 
+  /// Tell the consumer that this variable has been instantiated.
+  void handleCXXStaticMemberVarInstantiation(VarDecl *vd);
+
   llvm::DenseMap<const Decl *, cir::GlobalOp> staticLocalDeclMap;
 
   mlir::Operation *getGlobalValue(llvm::StringRef ref);
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index 332babd..dffe8b4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -256,6 +256,9 @@ mlir::LogicalResult CIRGenFunction::emitSimpleStmt(const Stmt *s,
   // NullStmt doesn't need any handling, but we need to say we handled it.
   case Stmt::NullStmtClass:
     break;
+
+  case Stmt::LabelStmtClass:
+    return emitLabelStmt(cast<LabelStmt>(*s));
   case Stmt::CaseStmtClass:
   case Stmt::DefaultStmtClass:
     // If we reached here, we must not handling a switch case in the top level.
@@ -272,6 +275,17 @@ mlir::LogicalResult CIRGenFunction::emitSimpleStmt(const Stmt *s,
   return mlir::success();
 }
 
+mlir::LogicalResult CIRGenFunction::emitLabelStmt(const clang::LabelStmt &s) {
+  // Emit the label first, then the statement it is attached to.
+  if (emitLabel(*s.getDecl()).failed())
+    return mlir::failure();
+
+  if (getContext().getLangOpts().EHAsynch && s.isSideEntry())
+    getCIRGenModule().errorNYI(s.getSourceRange(), "IsEHa: not implemented.");
+
+  return emitStmt(s.getSubStmt(), /*useCurrentScope*/ true);
+}
+
 // Add a terminating yield on a body region if no other terminators are used.
 static void terminateBody(CIRGenBuilderTy &builder, mlir::Region &r,
                           mlir::Location loc) {
@@ -429,6 +443,32 @@ CIRGenFunction::emitContinueStmt(const clang::ContinueStmt &s) {
   return mlir::success();
 }
 
+mlir::LogicalResult CIRGenFunction::emitLabel(const clang::LabelDecl &d) {
+  // Tag a block with the label. If the current block already holds
+  // operations, create a fresh block and branch to it from the current one;
+  // if the current block is empty, attach the label to it directly.
+  mlir::Block *currBlock = builder.getBlock();
+  mlir::Block *labelBlock = currBlock;
+
+  if (!currBlock->empty()) {
+    {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      labelBlock = builder.createBlock(builder.getBlock()->getParent());
+    }
+    cir::BrOp::create(builder, getLoc(d.getSourceRange()), labelBlock);
+  }
+
+  builder.setInsertionPointToEnd(labelBlock);
+  cir::LabelOp::create(builder, getLoc(d.getSourceRange()), d.getName());
+  builder.setInsertionPointToEnd(labelBlock);
+
+  // FIXME: emit debug info for labels, incrementProfileCounter
+  assert(!cir::MissingFeatures::ehstackBranches());
+  assert(!cir::MissingFeatures::incrementProfileCounter());
+  assert(!cir::MissingFeatures::generateDebugInfo());
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRGenFunction::emitBreakStmt(const clang::BreakStmt &s) {
   builder.createBreak(getLoc(s.getBreakLoc()));
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenerator.cpp b/clang/lib/CIR/CodeGen/CIRGenerator.cpp
index b0357d9..fb013d1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenerator.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenerator.cpp
@@ -163,7 +163,7 @@ void CIRGenerator::HandleCXXStaticMemberVarInstantiation(VarDecl *D) {
   if (diags.hasErrorOccurred())
     return;
 
-  cgm->errorNYI(D->getSourceRange(), "HandleCXXStaticMemberVarInstantiation");
+  cgm->handleCXXStaticMemberVarInstantiation(D);
 }
 
 void CIRGenerator::CompleteTentativeDefinition(VarDecl *d) {
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 7c84294..936247e9 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1785,6 +1785,19 @@ LogicalResult cir::ShiftOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
+// LabelOp Definitions
+//===----------------------------------------------------------------------===//
+
+LogicalResult cir::LabelOp::verify() {
+  mlir::Operation *op = getOperation();
+  mlir::Block *blk = op->getBlock();
+  if (&blk->front() != op)
+    return emitError() << "must be the first operation in a block";
+
+  return mlir::success();
+}
+
+//===----------------------------------------------------------------------===//
 // UnaryOp
 //===----------------------------------------------------------------------===//
 
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index 2eaa60c..d41ea0a 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -47,8 +47,8 @@ struct RemoveRedundantBranches : public OpRewritePattern<BrOp> {
     Block *block = op.getOperation()->getBlock();
     Block *dest = op.getDest();
 
-    assert(!cir::MissingFeatures::labelOp());
-
+    if (isa<cir::LabelOp>(dest->front()))
+      return failure();
     // Single edge between blocks: merge it.
     if (block->getNumSuccessors() == 1 &&
         dest->getSinglePredecessor() == block) {
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 88a0fe2..ad5f520 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1296,6 +1296,15 @@ mlir::LogicalResult CIRToLLVMExpectOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMFAbsOpLowering::matchAndRewrite(
+    cir::FAbsOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  mlir::Type resTy = typeConverter->convertType(op.getType());
+  rewriter.replaceOpWithNewOp<mlir::LLVM::FAbsOp>(op, resTy,
+                                                  adaptor.getOperands()[0]);
+  return mlir::success();
+}
+
 /// Convert the `cir.func` attributes to `llvm.func` attributes.
 /// Only retain those attributes that are not constructed by
 /// `LLVMFuncOp::build`. If `filterArgAttrs` is set, also filter out
@@ -2291,6 +2300,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                CIRToLLVMComplexSubOpLowering,
                CIRToLLVMConstantOpLowering,
                CIRToLLVMExpectOpLowering,
+               CIRToLLVMFAbsOpLowering,
                CIRToLLVMFuncOpLowering,
                CIRToLLVMGetBitfieldOpLowering,
                CIRToLLVMGetGlobalOpLowering,
@@ -2313,7 +2323,6 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                CIRToLLVMVecSplatOpLowering,
                CIRToLLVMVecTernaryOpLowering,
                CIRToLLVMUnreachableOpLowering
-      // clang-format on
       >(converter, patterns.getContext());
 
   processCIRAttrs(module);
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index 51b191a..a6d2d65 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -648,6 +648,15 @@ public:
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMFAbsOpLowering : public mlir::OpConversionPattern<cir::FAbsOp> {
+public:
+  using mlir::OpConversionPattern<cir::FAbsOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::FAbsOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 } // namespace direct
 } // namespace cir
 
diff --git a/clang/lib/CodeGen/Address.h b/clang/lib/CodeGen/Address.h
index a748dda..4e7f356 100644
--- a/clang/lib/CodeGen/Address.h
+++ b/clang/lib/CodeGen/Address.h
@@ -176,6 +176,11 @@ public:
   static Address invalid() { return Address(nullptr); }
   bool isValid() const { return Pointer.getPointer() != nullptr; }
 
+  llvm::Value *getPointerIfNotSigned() const {
+    assert(isValid() && "pointer isn't valid");
+    return !isSigned() ? Pointer.getPointer() : nullptr;
+  }
+
   /// This function is used in situations where the caller is doing some sort of
   /// opaque "laundering" of the pointer.
   void replaceBasePointer(llvm::Value *P) {
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index cfeba6f..74d92ef0 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -188,13 +188,14 @@ static llvm::Constant *buildBlockDescriptor(CodeGenModule &CGM,
   // Optional copy/dispose helpers.
   bool hasInternalHelper = false;
   if (blockInfo.NeedsCopyDispose) {
+    auto &Schema = CGM.getCodeGenOpts().PointerAuth.BlockHelperFunctionPointers;
     // copy_func_helper_decl
     llvm::Constant *copyHelper = buildCopyHelper(CGM, blockInfo);
-    elements.add(copyHelper);
+    elements.addSignedPointer(copyHelper, Schema, GlobalDecl(), QualType());
 
     // destroy_func_decl
     llvm::Constant *disposeHelper = buildDisposeHelper(CGM, blockInfo);
-    elements.add(disposeHelper);
+    elements.addSignedPointer(disposeHelper, Schema, GlobalDecl(), QualType());
 
     if (cast<llvm::Function>(copyHelper->stripPointerCasts())
             ->hasInternalLinkage() ||
@@ -568,9 +569,8 @@ static void computeBlockInfo(CodeGenModule &CGM, CodeGenFunction *CGF,
       llvm::StructType::get(CGM.getLLVMContext(), elementTypes, true);
     info.CanBeGlobal = true;
     return;
-  }
-  else if (C.getLangOpts().ObjC &&
-           CGM.getLangOpts().getGC() == LangOptions::NonGC)
+  } else if (C.getLangOpts().ObjC &&
+             CGM.getLangOpts().getGC() == LangOptions::NonGC)
     info.HasCapturedVariableLayout = true;
 
   if (block->doesNotEscape())
@@ -784,7 +784,7 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const BlockExpr *blockExpr) {
 
 llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
   bool IsOpenCL = CGM.getContext().getLangOpts().OpenCL;
-  auto GenVoidPtrTy =
+  llvm::PointerType *GenVoidPtrTy =
       IsOpenCL ? CGM.getOpenCLRuntime().getGenericVoidPointerType() : VoidPtrTy;
   LangAS GenVoidPtrAddr = IsOpenCL ? LangAS::opencl_generic : LangAS::Default;
   auto GenVoidPtrSize = CharUnits::fromQuantity(
@@ -818,9 +818,6 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
                                    : CGM.getNSConcreteStackBlock();
     isa = blockISA;
 
-    // Build the block descriptor.
-    descriptor = buildBlockDescriptor(CGM, blockInfo);
-
     // Compute the initial on-stack block flags.
     if (!CGM.getCodeGenOpts().DisableBlockSignatureString)
       flags = BLOCK_HAS_SIGNATURE;
@@ -834,6 +831,9 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
       flags |= BLOCK_USE_STRET;
     if (blockInfo.NoEscape)
       flags |= BLOCK_IS_NOESCAPE | BLOCK_IS_GLOBAL;
+
+    // Build the block descriptor.
+    descriptor = buildBlockDescriptor(CGM, blockInfo);
   }
 
   auto projectField = [&](unsigned index, const Twine &name) -> Address {
@@ -884,11 +884,25 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
           llvm::ConstantInt::get(IntTy, blockInfo.BlockAlign.getQuantity()),
           getIntSize(), "block.align");
     }
-    addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
-    if (!IsOpenCL)
-      addHeaderField(descriptor, getPointerSize(), "block.descriptor");
-    else if (auto *Helper =
-                 CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
+
+    if (!IsOpenCL) {
+      llvm::Value *blockFnPtr =
+          llvm::ConstantExpr::getBitCast(InvokeFn, VoidPtrTy);
+      QualType type = blockInfo.getBlockExpr()
+                          ->getType()
+                          ->castAs<BlockPointerType>()
+                          ->getPointeeType();
+      addSignedHeaderField(
+          blockFnPtr,
+          CGM.getCodeGenOpts().PointerAuth.BlockInvocationFunctionPointers,
+          GlobalDecl(), type, getPointerSize(), "block.invoke");
+
+      addSignedHeaderField(
+          descriptor, CGM.getCodeGenOpts().PointerAuth.BlockDescriptorPointers,
+          GlobalDecl(), type, getPointerSize(), "block.descriptor");
+    } else if (auto *Helper =
+                   CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
+      addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
       for (auto I : Helper->getCustomFieldValues(*this, blockInfo)) {
         addHeaderField(
             I.first,
@@ -896,7 +910,8 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
                 CGM.getDataLayout().getTypeAllocSize(I.first->getType())),
             I.second);
       }
-    }
+    } else
+      addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
   }
 
   // Finally, capture all the values into the block.
@@ -1167,6 +1182,8 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
   ASTContext &Ctx = getContext();
   CallArgList Args;
 
+  llvm::Value *FuncPtr = nullptr;
+
   if (getLangOpts().OpenCL) {
     // For OpenCL, BlockPtr is already casted to generic block literal.
 
@@ -1186,7 +1203,7 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     if (!isa<ParmVarDecl>(E->getCalleeDecl()))
       Func = CGM.getOpenCLRuntime().getInvokeFunction(E->getCallee());
     else {
-      llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 2);
+      FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 2);
       Func = Builder.CreateAlignedLoad(GenericVoidPtrTy, FuncPtr,
                                        getPointerAlign());
     }
@@ -1195,7 +1212,7 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     BlockPtr =
         Builder.CreatePointerCast(BlockPtr, UnqualPtrTy, "block.literal");
     // Get pointer to the block invoke function
-    llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3);
+    FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3);
 
     // First argument is a block literal casted to a void pointer
     BlockPtr = Builder.CreatePointerCast(BlockPtr, VoidPtrTy);
@@ -1212,7 +1229,15 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     CGM.getTypes().arrangeBlockFunctionCall(Args, FuncTy);
 
   // Prepare the callee.
-  CGCallee Callee(CGCalleeInfo(), Func);
+  CGPointerAuthInfo PointerAuth;
+  if (auto &AuthSchema =
+          CGM.getCodeGenOpts().PointerAuth.BlockInvocationFunctionPointers) {
+    assert(FuncPtr != nullptr && "Missing function pointer for AuthInfo");
+    PointerAuth =
+        EmitPointerAuthInfo(AuthSchema, FuncPtr, GlobalDecl(), FnType);
+  }
+
+  CGCallee Callee(CGCalleeInfo(), Func, PointerAuth);
 
   // And call the block.
   return EmitCall(FnInfo, Callee, ReturnValue, Args, CallOrInvoke);
@@ -1296,14 +1321,15 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
 
   bool IsOpenCL = CGM.getLangOpts().OpenCL;
   bool IsWindows = CGM.getTarget().getTriple().isOSWindows();
+  auto &CGOPointerAuth = CGM.getCodeGenOpts().PointerAuth;
   if (!IsOpenCL) {
     // isa
     if (IsWindows)
       fields.addNullPointer(CGM.Int8PtrPtrTy);
     else
       fields.addSignedPointer(CGM.getNSConcreteGlobalBlock(),
-                              CGM.getCodeGenOpts().PointerAuth.ObjCIsaPointers,
-                              GlobalDecl(), QualType());
+                              CGOPointerAuth.ObjCIsaPointers, GlobalDecl(),
+                              QualType());
 
     // __flags
     BlockFlags flags = BLOCK_IS_GLOBAL;
@@ -1322,11 +1348,20 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
   }
 
   // Function
-  fields.add(blockFn);
+  if (auto &Schema = CGOPointerAuth.BlockInvocationFunctionPointers) {
+    QualType FnType = blockInfo.getBlockExpr()
+                          ->getType()
+                          ->castAs<BlockPointerType>()
+                          ->getPointeeType();
+    fields.addSignedPointer(blockFn, Schema, GlobalDecl(), FnType);
+  } else
+    fields.add(blockFn);
 
   if (!IsOpenCL) {
     // Descriptor
-    fields.add(buildBlockDescriptor(CGM, blockInfo));
+    llvm::Constant *Descriptor = buildBlockDescriptor(CGM, blockInfo);
+    fields.addSignedPointer(Descriptor, CGOPointerAuth.BlockDescriptorPointers,
+                            GlobalDecl(), QualType());
   } else if (auto *Helper =
                  CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
     for (auto *I : Helper->getCustomFieldValues(CGM, blockInfo)) {
@@ -1996,8 +2031,8 @@ CodeGenFunction::GenerateCopyHelperFunction(const CGBlockInfo &blockInfo) {
         // it. It's not quite worth the annoyance to avoid creating it in the
         // first place.
         if (!needsEHCleanup(captureType.isDestructedType()))
-          if (auto *I =
-                  cast_or_null<llvm::Instruction>(dstField.getBasePointer()))
+          if (auto *I = cast_or_null<llvm::Instruction>(
+                  dstField.getPointerIfNotSigned()))
             I->eraseFromParent();
       }
       break;
@@ -2731,8 +2766,16 @@ void CodeGenFunction::emitByrefStructureInit(const AutoVarEmission &emission) {
   unsigned nextHeaderIndex = 0;
   CharUnits nextHeaderOffset;
   auto storeHeaderField = [&](llvm::Value *value, CharUnits fieldSize,
-                              const Twine &name) {
+                              const Twine &name, bool isFunction = false) {
     auto fieldAddr = Builder.CreateStructGEP(addr, nextHeaderIndex, name);
+    if (isFunction) {
+      if (auto &Schema = CGM.getCodeGenOpts()
+                             .PointerAuth.BlockByrefHelperFunctionPointers) {
+        auto PointerAuth = EmitPointerAuthInfo(
+            Schema, fieldAddr.emitRawPointer(*this), GlobalDecl(), QualType());
+        value = EmitPointerAuthSign(PointerAuth, value);
+      }
+    }
     Builder.CreateStore(value, fieldAddr);
 
     nextHeaderIndex++;
@@ -2815,10 +2858,10 @@ void CodeGenFunction::emitByrefStructureInit(const AutoVarEmission &emission) {
   storeHeaderField(V, getIntSize(), "byref.size");
 
   if (helpers) {
-    storeHeaderField(helpers->CopyHelper, getPointerSize(),
-                     "byref.copyHelper");
+    storeHeaderField(helpers->CopyHelper, getPointerSize(), "byref.copyHelper",
+                     /*isFunction=*/true);
     storeHeaderField(helpers->DisposeHelper, getPointerSize(),
-                     "byref.disposeHelper");
+                     "byref.disposeHelper", /*isFunction=*/true);
   }
 
   if (ByRefHasLifetime && HasByrefExtendedLayout) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 84be422..ad318f2 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -725,7 +725,7 @@ public:
   };
 
   /// Header for data within LifetimeExtendedCleanupStack.
-  struct LifetimeExtendedCleanupHeader {
+  struct alignas(uint64_t) LifetimeExtendedCleanupHeader {
     /// The size of the following cleanup object.
     unsigned Size;
     /// The kind of cleanup to push.
@@ -947,7 +947,8 @@ public:
         LifetimeExtendedCleanupStack.size() + sizeof(Header) + Header.Size +
         (Header.IsConditional ? sizeof(ActiveFlag) : 0));
 
-    static_assert(sizeof(Header) % alignof(T) == 0,
+    static_assert((alignof(LifetimeExtendedCleanupHeader) == alignof(T)) &&
+                      (alignof(T) == alignof(RawAddress)),
                   "Cleanup will be allocated on misaligned address");
     char *Buffer = &LifetimeExtendedCleanupStack[OldSize];
     new (Buffer) LifetimeExtendedCleanupHeader(Header);
diff --git a/clang/lib/CodeGen/EHScopeStack.h b/clang/lib/CodeGen/EHScopeStack.h
index ed11dc2..54f6cea 100644
--- a/clang/lib/CodeGen/EHScopeStack.h
+++ b/clang/lib/CodeGen/EHScopeStack.h
@@ -143,7 +143,7 @@ public:
   ///
   /// Cleanup implementations should generally be declared in an
   /// anonymous namespace.
-  class Cleanup {
+  class alignas(uint64_t) Cleanup {
     // Anchor the construction vtable.
     virtual void anchor();
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 293504ed..29b7180 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1736,7 +1736,6 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
                     options::OPT_fno_ptrauth_objc_interface_sel);
   Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_objc_class_ro,
                     options::OPT_fno_ptrauth_objc_class_ro);
-
   if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
     handlePAuthABI(Args, CmdArgs);
 
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 7ab41e9..547e315 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -937,6 +937,8 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
 
       if (Args.hasArg(options::OPT_fopenmp_force_usm))
         CmdArgs.push_back("-fopenmp-force-usm");
+      Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd,
+                      options::OPT_fno_openmp_simd);
 
       // FIXME: Clang supports a whole bunch more flags here.
       break;
@@ -952,6 +954,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
           << A->getSpelling() << A->getValue();
       break;
     }
+  } else {
+    Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd,
+                    options::OPT_fno_openmp_simd);
   }
 
   // Pass the path to compiler resource files.
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 2ea3ed7..a4d1896 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1542,6 +1542,16 @@ void CompilerInvocation::setDefaultPointerAuthOptions(
           Discrimination::Constant, InitFiniPointerConstantDiscriminator);
     }
 
+    Opts.BlockInvocationFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    Opts.BlockHelperFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    Opts.BlockByrefHelperFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    Opts.BlockDescriptorPointers =
+        PointerAuthSchema(Key::ASDA, true, Discrimination::Constant,
+                          BlockDescriptorConstantDiscriminator);
+
     Opts.ObjCMethodListFunctionPointers =
         PointerAuthSchema(Key::ASIA, true, Discrimination::None);
     Opts.ObjCMethodListPointer =
@@ -3621,7 +3631,6 @@ static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
   Opts.PointerAuthELFGOT = Args.hasArg(OPT_fptrauth_elf_got);
   Opts.AArch64JumpTableHardening =
       Args.hasArg(OPT_faarch64_jump_table_hardening);
-
   Opts.PointerAuthObjcIsa = Args.hasArg(OPT_fptrauth_objc_isa);
   Opts.PointerAuthObjcClassROPointers = Args.hasArg(OPT_fptrauth_objc_class_ro);
   Opts.PointerAuthObjcInterfaceSel =
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 35258d4..0d627488 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -2712,9 +2712,8 @@ _mm256_subs_epu16(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
 ///    of the result.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
 }
 
@@ -2747,9 +2746,8 @@ _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
 ///    elements of the result.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
@@ -2781,9 +2779,8 @@ _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
 ///    elements of the result.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
 }
 
@@ -2811,9 +2808,8 @@ _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
 ///    elements of the result.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
 }
 
@@ -2845,9 +2841,8 @@ _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
 ///    of the result.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
 }
 
@@ -2880,9 +2875,8 @@ _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
 ///    elements of the result.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
 }
 
@@ -2914,9 +2908,8 @@ _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
 ///    elements of the result.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
 }
 
@@ -2944,9 +2937,8 @@ _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
 ///    elements of the result.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
 }
 
@@ -2997,9 +2989,8 @@ _mm256_stream_load_si256(const void *__V)
 /// \param __X
 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
 /// \returns A 128-bit vector of [4 x float] containing the result.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_broadcastss_ps(__m128 __X)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_broadcastss_ps(__m128 __X) {
   return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
 }
 
@@ -3014,9 +3005,8 @@ _mm_broadcastss_ps(__m128 __X)
 /// \param __a
 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
 /// \returns A 128-bit vector of [2 x double] containing the result.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_broadcastsd_pd(__m128d __a)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_broadcastsd_pd(__m128d __a) {
   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 }
 
@@ -3031,9 +3021,8 @@ _mm_broadcastsd_pd(__m128d __a)
 /// \param __X
 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
 /// \returns A 256-bit vector of [8 x float] containing the result.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcastss_ps(__m128 __X)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcastss_ps(__m128 __X) {
   return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
@@ -3048,9 +3037,8 @@ _mm256_broadcastss_ps(__m128 __X)
 /// \param __X
 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
 /// \returns A 256-bit vector of [4 x double] containing the result.
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_broadcastsd_pd(__m128d __X)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcastsd_pd(__m128d __X) {
   return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
 }
 
@@ -3064,9 +3052,8 @@ _mm256_broadcastsd_pd(__m128d __X)
 /// \param __X
 ///    A 128-bit integer vector to be broadcast.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastsi128_si256(__m128i __X)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcastsi128_si256(__m128i __X) {
   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
 }
 
@@ -3156,9 +3143,8 @@ _mm256_broadcastsi128_si256(__m128i __X)
 /// \param __X
 ///    A 128-bit integer vector whose low byte will be broadcast.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastb_epi8(__m128i __X)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcastb_epi8(__m128i __X) {
   return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
@@ -3172,9 +3158,8 @@ _mm256_broadcastb_epi8(__m128i __X)
 /// \param __X
 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastw_epi16(__m128i __X)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcastw_epi16(__m128i __X) {
   return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
@@ -3188,9 +3173,8 @@ _mm256_broadcastw_epi16(__m128i __X)
 /// \param __X
 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastd_epi32(__m128i __X)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcastd_epi32(__m128i __X) {
   return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
@@ -3204,9 +3188,8 @@ _mm256_broadcastd_epi32(__m128i __X)
 /// \param __X
 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastq_epi64(__m128i __X)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcastq_epi64(__m128i __X) {
   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
 }
 
@@ -3220,9 +3203,8 @@ _mm256_broadcastq_epi64(__m128i __X)
 /// \param __X
 ///    A 128-bit integer vector whose low byte will be broadcast.
 /// \returns A 128-bit integer vector containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastb_epi8(__m128i __X)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_broadcastb_epi8(__m128i __X) {
   return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
@@ -3236,9 +3218,8 @@ _mm_broadcastb_epi8(__m128i __X)
 /// \param __X
 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
 /// \returns A 128-bit vector of [8 x i16] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastw_epi16(__m128i __X)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_broadcastw_epi16(__m128i __X) {
   return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
@@ -3252,9 +3233,8 @@ _mm_broadcastw_epi16(__m128i __X)
 /// \param __X
 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
 /// \returns A 128-bit vector of [4 x i32] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastd_epi32(__m128i __X)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_broadcastd_epi32(__m128i __X) {
   return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
 }
 
@@ -3268,9 +3248,8 @@ _mm_broadcastd_epi32(__m128i __X)
 /// \param __X
 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
 /// \returns A 128-bit vector of [2 x i64] containing the result.
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastq_epi64(__m128i __X)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_broadcastq_epi64(__m128i __X) {
   return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
 }
 
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 8867832..118e1cb 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -1247,7 +1247,7 @@ _mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
   __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
                                           8,  64+8,   9, 64+9,
@@ -1282,7 +1282,7 @@ _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
                                         (__v64qi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
                                           4,  32+4,   5, 32+5,
@@ -1309,7 +1309,7 @@ _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
                                        (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
                                           0,  64+0,   1, 64+1,
@@ -1344,7 +1344,7 @@ _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
                                         (__v64qi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
                                           0,  32+0,   1, 32+1,
@@ -1881,9 +1881,8 @@ _mm512_movm_epi16 (__mmask32 __A)
   return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastb_epi8 (__m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcastb_epi8(__m128i __A) {
   return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -1923,9 +1922,8 @@ _mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
                                               (__v32hi) _mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastw_epi16 (__m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcastw_epi16(__m128i __A) {
   return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index 62325b9..87d16b47 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -1084,10 +1084,8 @@ _mm512_movepi64_mask (__m512i __A)
   return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
 }
 
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x2 (__m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_f32x2(__m128 __A) {
   return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
                                          0, 1, 0, 1, 0, 1, 0, 1,
                                          0, 1, 0, 1, 0, 1, 0, 1);
@@ -1109,9 +1107,8 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
                                              (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x8(__m256 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_f32x8(__m256 __A) {
   return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
                                          0, 1, 2, 3, 4, 5, 6, 7,
                                          0, 1, 2, 3, 4, 5, 6, 7);
@@ -1133,9 +1130,8 @@ _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
                                            (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f64x2(__m128d __A)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_f64x2(__m128d __A) {
   return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
                                           0, 1, 0, 1, 0, 1, 0, 1);
 }
@@ -1156,9 +1152,8 @@ _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
                                             (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x2 (__m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_i32x2(__m128i __A) {
   return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                           0, 1, 0, 1, 0, 1, 0, 1,
                                           0, 1, 0, 1, 0, 1, 0, 1);
@@ -1180,9 +1175,8 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
                                              (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x8(__m256i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_i32x8(__m256i __A) {
   return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
                                           0, 1, 2, 3, 4, 5, 6, 7,
                                           0, 1, 2, 3, 4, 5, 6, 7);
@@ -1204,9 +1198,8 @@ _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
                                            (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i64x2(__m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_i64x2(__m128i __A) {
   return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
                                           0, 1, 0, 1, 0, 1, 0, 1);
 }
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 90f883b..05a291e 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -218,9 +218,8 @@ _mm512_undefined_epi32(void)
   return (__m512i)__builtin_ia32_undef512();
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastd_epi32 (__m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcastd_epi32(__m128i __A) {
   return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
@@ -241,9 +240,8 @@ _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
                                              (__v16si) _mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastq_epi64 (__m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcastq_epi64(__m128i __A) {
   return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0);
 }
@@ -344,9 +342,8 @@ _mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
                                              (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcastss_ps(__m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcastss_ps(__m128 __A) {
   return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
@@ -389,9 +386,8 @@ _mm512_set4_ps(float __A, float __B, float __C, float __D) {
 #define _mm512_setr4_ps(e0,e1,e2,e3)                \
   _mm512_set4_ps((e3),(e2),(e1),(e0))
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcastsd_pd(__m128d __A)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcastsd_pd(__m128d __A) {
   return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
                                           0, 0, 0, 0, 0, 0, 0, 0);
 }
@@ -4203,9 +4199,8 @@ _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
                                           (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_unpackhi_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                           2,    18,    3,    19,
                                           2+4,  18+4,  3+4,  19+4,
@@ -4229,9 +4224,8 @@ _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
                                        (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_unpacklo_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                           0,    16,    1,    17,
                                           0+4,  16+4,  1+4,  17+4,
@@ -4255,9 +4249,8 @@ _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
                                        (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_unpackhi_epi64(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                           1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
 }
@@ -4278,9 +4271,8 @@ _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
                                         (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_unpacklo_epi64(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                           0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
 }
@@ -6799,9 +6791,8 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
                                             (__v4sf)_mm_setzero_ps(), \
                                             (__mmask8)(U), (int)(R)))
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x4(__m128 __A)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_f32x4(__m128 __A) {
   return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
                                          0, 1, 2, 3, 0, 1, 2, 3,
                                          0, 1, 2, 3, 0, 1, 2, 3);
@@ -6823,9 +6814,8 @@ _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
                                            (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f64x4(__m256d __A)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_f64x4(__m256d __A) {
   return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
                                           0, 1, 2, 3, 0, 1, 2, 3);
 }
@@ -6846,9 +6836,8 @@ _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
                                             (__v8df)_mm512_setzero_pd());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x4(__m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_i32x4(__m128i __A) {
   return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                           0, 1, 2, 3, 0, 1, 2, 3,
                                           0, 1, 2, 3, 0, 1, 2, 3);
@@ -6870,9 +6859,8 @@ _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
                                            (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i64x4(__m256i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_broadcast_i64x4(__m256i __A) {
   return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
                                           0, 1, 2, 3, 0, 1, 2, 3);
 }
diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h
index 272cdd8..ceebd09 100644
--- a/clang/lib/Headers/avx512vldqintrin.h
+++ b/clang/lib/Headers/avx512vldqintrin.h
@@ -24,6 +24,14 @@
                  __target__("avx512vl,avx512dq,no-evex512"),                   \
                  __min_vector_width__(256)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#else
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#endif
+
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mullo_epi64 (__m256i __A, __m256i __B) {
   return (__m256i) ((__v4du) __A * (__v4du) __B);
@@ -956,9 +964,8 @@ _mm256_movepi64_mask (__m256i __A)
   return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f32x2 (__m128 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcast_f32x2(__m128 __A) {
   return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
                                          0, 1, 0, 1, 0, 1, 0, 1);
 }
@@ -979,9 +986,8 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
                                              (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f64x2(__m128d __A)
-{
+static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcast_f64x2(__m128d __A) {
   return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
                                           0, 1, 0, 1);
 }
@@ -1002,9 +1008,8 @@ _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
                                             (__v4df)_mm256_setzero_pd());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcast_i32x2 (__m128i __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_broadcast_i32x2(__m128i __A) {
   return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                           0, 1, 0, 1);
 }
@@ -1025,9 +1030,8 @@ _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
                                              (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i32x2 (__m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcast_i32x2(__m128i __A) {
   return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                           0, 1, 0, 1, 0, 1, 0, 1);
 }
@@ -1048,9 +1052,8 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
                                              (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i64x2(__m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcast_i64x2(__m128i __A) {
   return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
                                           0, 1, 0, 1);
 }
@@ -1169,5 +1172,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 366adab..09b76d4 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -23,6 +23,14 @@
                  __target__("avx512vl,no-evex512"),                            \
                  __min_vector_width__(256)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#else
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#endif
+
 typedef short __v2hi __attribute__((__vector_size__(4)));
 typedef char __v4qi __attribute__((__vector_size__(4)));
 typedef char __v2qi __attribute__((__vector_size__(2)));
@@ -6744,9 +6752,8 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
                 (__mmask8) __U);
 }
 
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f32x4(__m128 __A)
-{
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcast_f32x4(__m128 __A) {
   return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
@@ -6767,9 +6774,8 @@ _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
                                             (__v8sf)_mm256_setzero_ps());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i32x4(__m128i __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_broadcast_i32x4(__m128i __A) {
   return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                           0, 1, 2, 3, 0, 1, 2, 3);
 }
@@ -8385,5 +8391,7 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
 
 #endif /* __AVX512VLINTRIN_H */
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 1973ccb..58caded 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4421,8 +4421,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
 ///    Bits [119:112] are written to bits [111:104] of the result. \n
 ///    Bits [127:120] are written to bits [127:120] of the result.
 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
-                                                               __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpackhi_epi8(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_shufflevector(
       (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
       16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
@@ -4449,8 +4449,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
 ///    Bits [111:96] are written to bits [95:80] of the result. \n
 ///    Bits [127:112] are written to bits [127:112] of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
-                                                                __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpackhi_epi16(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
                                           8 + 5, 6, 8 + 6, 7, 8 + 7);
 }
@@ -4472,8 +4472,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
 ///    Bits [95:64] are written to bits [64:32] of the destination. \n
 ///    Bits [127:96] are written to bits [127:96] of the destination.
 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
-                                                                __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpackhi_epi32(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
                                           4 + 3);
 }
@@ -4493,8 +4493,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
 ///    A 128-bit vector of [2 x i64]. \n
 ///    Bits [127:64] are written to bits [127:64] of the destination.
 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
-                                                                __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpackhi_epi64(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
 }
 
@@ -4527,8 +4527,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
 ///    Bits [55:48] are written to bits [111:104] of the result. \n
 ///    Bits [63:56] are written to bits [127:120] of the result.
 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
-                                                               __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpacklo_epi8(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_shufflevector(
       (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
       16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
@@ -4556,8 +4556,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
 ///    Bits [47:32] are written to bits [95:80] of the result. \n
 ///    Bits [63:48] are written to bits [127:112] of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
-                                                                __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpacklo_epi16(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
                                           8 + 1, 2, 8 + 2, 3, 8 + 3);
 }
@@ -4579,8 +4579,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
 ///    Bits [31:0] are written to bits [64:32] of the destination. \n
 ///    Bits [63:32] are written to bits [127:96] of the destination.
 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
-                                                                __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpacklo_epi32(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
                                           4 + 1);
 }
@@ -4600,8 +4600,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
 ///    A 128-bit vector of [2 x i64]. \n
 ///    Bits [63:0] are written to bits [127:64] of the destination. \n
 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
-                                                                __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpacklo_epi64(__m128i __a, __m128i __b) {
   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
 }
 
diff --git a/clang/lib/Parse/ParseCXXInlineMethods.cpp b/clang/lib/Parse/ParseCXXInlineMethods.cpp
index 9a010fb..74e2500 100644
--- a/clang/lib/Parse/ParseCXXInlineMethods.cpp
+++ b/clang/lib/Parse/ParseCXXInlineMethods.cpp
@@ -1161,6 +1161,12 @@ bool Parser::ConsumeAndStoreInitializer(CachedTokens &Toks,
 
   while (true) {
     switch (Tok.getKind()) {
+    case tok::ellipsis:
+      // We found an ellipsis at the end of the parameter list;
+      // it is not part of a parameter declaration.
+      if (ParenCount == 1 && NextToken().is(tok::r_paren))
+        return true;
+      goto consume_token;
     case tok::comma:
       // If we might be in a template, perform a tentative parse to check.
       if (!AngleCount)
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index e57a789..7d190ea 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -1418,6 +1418,10 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D,
     // parameter list was specified.
     CurTemplateDepthTracker.addDepth(1);
 
+  // Late attributes are parsed in the same scope as the function body.
+  if (LateParsedAttrs)
+    ParseLexedAttributeList(*LateParsedAttrs, Res, false, true);
+
   if (SkipFunctionBodies && (!Res || Actions.canSkipFunctionBody(Res)) &&
       trySkippingFunctionBody()) {
     BodyScope.Exit();
@@ -1442,10 +1446,6 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D,
   } else
     Actions.ActOnDefaultCtorInitializers(Res);
 
-  // Late attributes are parsed in the same scope as the function body.
-  if (LateParsedAttrs)
-    ParseLexedAttributeList(*LateParsedAttrs, Res, false, true);
-
   return ParseFunctionStatementBody(Res, BodyScope);
 }
 
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 90774037..2dc4ee7 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -41,6 +41,7 @@
 #include "clang/AST/UnresolvedSet.h"
 #include "clang/Basic/AddressSpaces.h"
 #include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/LangOptions.h"
@@ -7656,17 +7657,13 @@ bool EquatableFormatArgument::VerifyCompatible(
     break;
 
   case MK::NoMatchSignedness:
-    if (!S.getDiagnostics().isIgnored(
-            diag::warn_format_conversion_argument_type_mismatch_signedness,
-            ElementLoc)) {
-      EmitDiagnostic(S,
-                     S.PDiag(diag::warn_format_cmp_specifier_sign_mismatch)
-                         << buildFormatSpecifier()
-                         << Other.buildFormatSpecifier(),
-                     FmtExpr, InFunctionCall);
-      HadError = S.Diag(Other.ElementLoc, diag::note_format_cmp_with)
-                 << 0 << Other.Range;
-    }
+    EmitDiagnostic(S,
+                   S.PDiag(diag::warn_format_cmp_specifier_sign_mismatch)
+                       << buildFormatSpecifier()
+                       << Other.buildFormatSpecifier(),
+                   FmtExpr, InFunctionCall);
+    HadError = S.Diag(Other.ElementLoc, diag::note_format_cmp_with)
+               << 0 << Other.Range;
     break;
   }
   return !HadError;
@@ -8203,11 +8200,14 @@ static analyze_format_string::ArgType::MatchKind
 handleFormatSignedness(analyze_format_string::ArgType::MatchKind Match,
                        DiagnosticsEngine &Diags, SourceLocation Loc) {
   if (Match == analyze_format_string::ArgType::NoMatchSignedness) {
-    Match =
+    if (Diags.isIgnored(
+            diag::warn_format_conversion_argument_type_mismatch_signedness,
+            Loc) ||
         Diags.isIgnored(
-            diag::warn_format_conversion_argument_type_mismatch_signedness, Loc)
-            ? analyze_format_string::ArgType::Match
-            : analyze_format_string::ArgType::NoMatch;
+            // Arbitrary -Wformat diagnostic to detect -Wno-format:
+            diag::warn_format_conversion_argument_type_mismatch, Loc)) {
+      return analyze_format_string::ArgType::Match;
+    }
   }
   return Match;
 }
@@ -8424,8 +8424,10 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
       case ArgType::Match:
       case ArgType::MatchPromotion:
       case ArgType::NoMatchPromotionTypeConfusion:
-      case ArgType::NoMatchSignedness:
         llvm_unreachable("expected non-matching");
+      case ArgType::NoMatchSignedness:
+        Diag = diag::warn_format_conversion_argument_type_mismatch_signedness;
+        break;
       case ArgType::NoMatchPedantic:
         Diag = diag::warn_format_conversion_argument_type_mismatch_pedantic;
         break;
@@ -8750,9 +8752,10 @@ bool CheckScanfHandler::HandleScanfSpecifier(
   analyze_format_string::ArgType::MatchKind Match =
       AT.matchesType(S.Context, Ex->getType());
   Match = handleFormatSignedness(Match, S.getDiagnostics(), Ex->getExprLoc());
-  bool Pedantic = Match == analyze_format_string::ArgType::NoMatchPedantic;
   if (Match == analyze_format_string::ArgType::Match)
     return true;
+  bool Pedantic = Match == analyze_format_string::ArgType::NoMatchPedantic;
+  bool Signedness = Match == analyze_format_string::ArgType::NoMatchSignedness;
 
   ScanfSpecifier fixedFS = FS;
   bool Success = fixedFS.fixType(Ex->getType(), Ex->IgnoreImpCasts()->getType(),
@@ -8760,7 +8763,9 @@ bool CheckScanfHandler::HandleScanfSpecifier(
 
   unsigned Diag =
       Pedantic ? diag::warn_format_conversion_argument_type_mismatch_pedantic
-               : diag::warn_format_conversion_argument_type_mismatch;
+      : Signedness
+          ? diag::warn_format_conversion_argument_type_mismatch_signedness
+          : diag::warn_format_conversion_argument_type_mismatch;
 
   if (Success) {
     // Get the fix string from the fixed format specifier.
@@ -14120,7 +14125,6 @@ void Sema::CheckCompletedExpr(Expr *E, SourceLocation CheckLoc,
     CheckUnsequencedOperations(E);
   if (!IsConstexpr && !E->isValueDependent())
     CheckForIntOverflow(E);
-  DiagnoseMisalignedMembers();
 }
 
 void Sema::CheckBitFieldInitialization(SourceLocation InitLoc,
@@ -15565,11 +15569,12 @@ void Sema::CheckArgumentWithTypeTag(const ArgumentWithTypeTagAttr *Attr,
 
 void Sema::AddPotentialMisalignedMembers(Expr *E, RecordDecl *RD, ValueDecl *MD,
                                          CharUnits Alignment) {
-  MisalignedMembers.emplace_back(E, RD, MD, Alignment);
+  currentEvaluationContext().MisalignedMembers.emplace_back(E, RD, MD,
+                                                            Alignment);
 }
 
 void Sema::DiagnoseMisalignedMembers() {
-  for (MisalignedMember &m : MisalignedMembers) {
+  for (MisalignedMember &m : currentEvaluationContext().MisalignedMembers) {
     const NamedDecl *ND = m.RD;
     if (ND->getName().empty()) {
       if (const TypedefNameDecl *TD = m.RD->getTypedefNameForAnonDecl())
@@ -15578,7 +15583,7 @@ void Sema::DiagnoseMisalignedMembers() {
     Diag(m.E->getBeginLoc(), diag::warn_taking_address_of_packed_member)
         << m.MD << ND << m.E->getSourceRange();
   }
-  MisalignedMembers.clear();
+  currentEvaluationContext().MisalignedMembers.clear();
 }
 
 void Sema::DiscardMisalignedMemberAddress(const Type *T, Expr *E) {
@@ -15589,13 +15594,15 @@ void Sema::DiscardMisalignedMemberAddress(const Type *T, Expr *E) {
       cast<UnaryOperator>(E)->getOpcode() == UO_AddrOf) {
     auto *Op = cast<UnaryOperator>(E)->getSubExpr()->IgnoreParens();
     if (isa<MemberExpr>(Op)) {
-      auto *MA = llvm::find(MisalignedMembers, MisalignedMember(Op));
-      if (MA != MisalignedMembers.end() &&
+      auto &MisalignedMembersForExpr =
+          currentEvaluationContext().MisalignedMembers;
+      auto *MA = llvm::find(MisalignedMembersForExpr, MisalignedMember(Op));
+      if (MA != MisalignedMembersForExpr.end() &&
           (T->isDependentType() || T->isIntegerType() ||
            (T->isPointerType() && (T->getPointeeType()->isIncompleteType() ||
                                    Context.getTypeAlignInChars(
                                        T->getPointeeType()) <= MA->Alignment))))
-        MisalignedMembers.erase(MA);
+        MisalignedMembersForExpr.erase(MA);
     }
   }
 }
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index cb59782..6581d4c 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3653,7 +3653,9 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S,
   FunctionDecl *Old = OldD->getAsFunction();
   if (!Old) {
     if (UsingShadowDecl *Shadow = dyn_cast<UsingShadowDecl>(OldD)) {
-      if (New->getFriendObjectKind()) {
+      // We don't need to check the using-friend pattern from another module
+      // unit, since such cases should already have been diagnosed in that unit.
+      if (New->getFriendObjectKind() && !OldD->isInAnotherModuleUnit()) {
         Diag(New->getLocation(), diag::err_using_decl_friend);
         Diag(Shadow->getTargetDecl()->getLocation(),
              diag::note_using_decl_target);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 8532039..237c068 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -18121,6 +18121,8 @@ void Sema::PopExpressionEvaluationContext() {
     MaybeODRUseExprs.insert_range(Rec.SavedMaybeODRUseExprs);
   }
 
+  DiagnoseMisalignedMembers();
+
   // Pop the current expression evaluation context off the stack.
   ExprEvalContexts.pop_back();
 }
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp
index e8a18243..aa54ff8 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -1054,13 +1054,17 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause(
 OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
 
-  // Based on discussions, having more than 1 'architecture' on a 'set' is
-  // nonsensical, so we're going to fix the standard to reflect this.  Implement
-  // the limitation, since the Dialect requires this.
-  if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Set &&
+  // OpenACC Pull #550 (https://github.com/OpenACC/openacc-spec/pull/550)
+  // clarified that Init, Shutdown, and Set only support a single architecture.
+  // Though the dialect only requires it for 'set' as far as we know, we'll just
+  // implement all 3 here.
+  if ((Clause.getDirectiveKind() == OpenACCDirectiveKind::Init ||
+       Clause.getDirectiveKind() == OpenACCDirectiveKind::Shutdown ||
+       Clause.getDirectiveKind() == OpenACCDirectiveKind::Set) &&
       Clause.getDeviceTypeArchitectures().size() > 1) {
     SemaRef.Diag(Clause.getDeviceTypeArchitectures()[1].getLoc(),
-                 diag::err_acc_device_type_multiple_archs);
+                 diag::err_acc_device_type_multiple_archs)
+        << Clause.getDirectiveKind();
     return nullptr;
   }
 
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index d593d1d..ac64dd5 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6275,7 +6275,9 @@ static ExprResult BuildConvertedConstantExpression(Sema &S, Expr *From,
                                                    QualType T, CCEKind CCE,
                                                    NamedDecl *Dest,
                                                    APValue &PreNarrowingValue) {
-  assert((S.getLangOpts().CPlusPlus11 || CCE == CCEKind::TempArgStrict) &&
+  [[maybe_unused]] bool isCCEAllowedPreCXX11 =
+      (CCE == CCEKind::TempArgStrict || CCE == CCEKind::ExplicitBool);
+  assert((S.getLangOpts().CPlusPlus11 || isCCEAllowedPreCXX11) &&
          "converted constant expression outside C++11 or TTP matching");
 
   if (checkPlaceholderForOverload(S, From))
diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
index 40c3edb..b67f7991 100644
--- a/clang/test/AST/ast-dump-comment.cpp
+++ b/clang/test/AST/ast-dump-comment.cpp
@@ -131,3 +131,9 @@ void Test_TemplatedFunctionVariadic(int arg, ...);
 // CHECK:        ParamCommandComment{{.*}} [in] implicitly Param="..."
 // CHECK-NEXT:     ParagraphComment
 // CHECK-NEXT:       TextComment{{.*}} Text=" More arguments"
+
+/// \thread_safe test for underscore in special command
+int Test_UnderscoreInSpecialCommand;
+// CHECK:      VarDecl{{.*}}Test_UnderscoreInSpecialCommand 'int'
+// CHECK:        InlineCommandComment{{.*}} Name="thread_safe" RenderNormal
+// CHECK-NEXT:     TextComment{{.*}} Text=" test for underscore in special command"
diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp
index 141b67e..60028af4 100644
--- a/clang/test/CIR/CodeGen/array.cpp
+++ b/clang/test/CIR/CodeGen/array.cpp
@@ -129,31 +129,50 @@ void func2() {
 }
 
 // CIR: %[[ARR2:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
-// CIR: %[[ELE_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp", init]
-// CIR: %[[ARR_2_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR2]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
-// CIR: %[[V1:.*]] = cir.const #cir.int<5> : !s32i
-// CIR: cir.store{{.*}} %[[V1]], %[[ARR_2_PTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp", init]
+// CIR: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %[[ARR2]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
+// CIR: %[[FIVE:.*]] = cir.const #cir.int<5> : !s32i
+// CIR: cir.store{{.*}} %[[FIVE]], %[[ARR_0]] : !s32i, !cir.ptr<!s32i>
 // CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
-// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_2_PTR]] : !cir.ptr<!s32i>, %[[OFFSET_0]] : !s64i), !cir.ptr<!s32i>
-// CIR: cir.store{{.*}} %[[ELE_PTR]], %[[ELE_ALLOCA]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CIR: %[[LOAD_1:.*]] = cir.load{{.*}} %[[ELE_ALLOCA]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
-// CIR: %[[V2:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: cir.store{{.*}} %[[V2]], %[[LOAD_1]] : !s32i, !cir.ptr<!s32i>
-// CIR: %[[OFFSET_1:.*]] = cir.const #cir.int<1> : !s64i
-// CIR: %[[ELE_1_PTR:.*]] = cir.ptr_stride(%[[LOAD_1]] : !cir.ptr<!s32i>, %[[OFFSET_1]] : !s64i), !cir.ptr<!s32i>
-// CIR: cir.store{{.*}} %[[ELE_1_PTR]], %[[ELE_ALLOCA]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!s32i>, %[[OFFSET_0]] : !s64i), !cir.ptr<!s32i>
+// CIR: cir.store{{.*}} %[[ELE_PTR]], %[[ARR_PTR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s64i
+// CIR: %[[ARR_END:.*]] = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!s32i>, %[[TWO]] : !s64i), !cir.ptr<!s32i>
+// CIR: cir.do {
+// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
+// CIR:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR:   cir.store{{.*}} %[[ZERO]], %[[ARR_CUR]] : !s32i, !cir.ptr<!s32i>
+// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
+// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride(%[[ARR_CUR]] : !cir.ptr<!s32i>, %[[ONE]] : !s64i), !cir.ptr<!s32i>
+// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR:   cir.yield
+// CIR: } while {
+// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
+// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!s32i>, !cir.bool
+// CIR:   cir.condition(%[[CMP]])
+// CIR: }
 
 // LLVM: define{{.*}} void @_Z5func2v()
-// LLVM:  %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
-// LLVM:  %[[TMP:.*]] = alloca ptr, i64 1, align 8
-// LLVM:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
-// LLVM:  store i32 5, ptr %[[ARR_PTR]], align 4
-// LLVM:  %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
-// LLVM:  store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
-// LLVM:  %[[TMP2:.*]] = load ptr, ptr %[[TMP]], align 8
-// LLVM:  store i32 0, ptr %[[TMP2]], align 4
-// LLVM:  %[[ELE_1:.*]] = getelementptr i32, ptr %[[TMP2]], i64 1
-// LLVM:  store ptr %[[ELE_1]], ptr %[[TMP]], align 8
+// LLVM:   %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
+// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// LLVM:   %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
+// LLVM:   store i32 5, ptr %[[ARR_PTR]], align 4
+// LLVM:   %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
+// LLVM:   store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
+// LLVM:   %[[END_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 2
+// LLVM:   br label %[[LOOP_BODY:.*]]
+// LLVM: [[LOOP_NEXT:.*]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
+// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
+// LLVM: [[LOOP_BODY]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// LLVM:   store i32 0, ptr %[[CUR]], align 4
+// LLVM:   %[[NEXT:.*]] = getelementptr i32, ptr %[[CUR]], i64 1
+// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
+// LLVM:   br label %[[LOOP_NEXT]]
+// LLVM: [[LOOP_END]]:
+// LLVM:   ret void
 
 // OGCG: %[[ARR:.*]] = alloca [2 x i32], align 4
 // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[ARR]], ptr align 4 @[[FUN2_ARR]], i64 8, i1 false)
@@ -270,27 +289,46 @@ void func5() {
 // CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: cir.store{{.*}} %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
 // CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
-// CIR: %6 = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
-// CIR: cir.store{{.*}} %6, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
-// CIR: %7 = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
-// CIR: %8 = cir.const #cir.zero : !cir.array<!s32i x 1>
-// CIR: cir.store{{.*}} %8, %7 : !cir.array<!s32i x 1>, !cir.ptr<!cir.array<!s32i x 1>>
-// CIR: %[[OFFSET_1:.*]] = cir.const #cir.int<1> : !s64i
-// CIR: %10 = cir.ptr_stride(%7 : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET_1]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
-// CIR: cir.store{{.*}} %10, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
+// CIR: %[[ARR_1:.*]] = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: cir.store{{.*}} %[[ARR_1]], %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
+// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s64i
+// CIR: %[[ARR_END:.*]] = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[TWO]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR: cir.do {
+// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
+// CIR:   %[[ZERO:.*]] = cir.const #cir.zero : !cir.array<!s32i x 1>
+// CIR:   cir.store{{.*}} %[[ZERO]], %[[ARR_CUR]] : !cir.array<!s32i x 1>, !cir.ptr<!cir.array<!s32i x 1>>
+// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
+// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride(%[[ARR_CUR]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[ONE]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
+// CIR:   cir.yield
+// CIR: } while {
+// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
+// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!cir.array<!s32i x 1>>, !cir.bool
+// CIR:   cir.condition(%[[CMP]])
+// CIR: }
 
 // LLVM: define{{.*}} void @_Z5func5v()
-// LLVM:  %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
-// LLVM:  %[[TMP:.*]] = alloca ptr, i64 1, align 8
-// LLVM:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
-// LLVM:  %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
-// LLVM:  store i32 5, ptr %[[ARR_0]], align 4
-// LLVM:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
-// LLVM:  store ptr %[[ARR_1]], ptr %[[TMP]], align 8
-// LLVM:  %[[ARR_1_VAL:.*]] = load ptr, ptr %[[TMP]], align 8
-// LLVM:  store [1 x i32] zeroinitializer, ptr %[[ARR_1_VAL]], align 4
-// LLVM:  %[[ARR_1_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_1_VAL]], i64 1
-// LLVM:  store ptr %[[ARR_1_PTR]], ptr %[[TMP]], align 8
+// LLVM:   %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
+// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// LLVM:   %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
+// LLVM:   %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
+// LLVM:   store i32 5, ptr %[[ARR_0]], align 4
+// LLVM:   %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
+// LLVM:   store ptr %[[ARR_1]], ptr %[[TMP]], align 8
+// LLVM:   %[[END_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 2
+// LLVM:   br label %[[LOOP_BODY:.*]]
+// LLVM: [[LOOP_NEXT:.*]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
+// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
+// LLVM: [[LOOP_BODY]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// LLVM:   store [1 x i32] zeroinitializer, ptr %[[CUR]], align 4
+// LLVM:   %[[NEXT:.*]] = getelementptr [1 x i32], ptr %[[CUR]], i64 1
+// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
+// LLVM:   br label %[[LOOP_NEXT]]
+// LLVM: [[LOOP_END]]:
+// LLVM:   ret void
 
 // ORGC: %[[ARR:.*]] = alloca [2 x [1 x i32]], align 4
 // ORGC: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[ARR]], ptr align 4 @[[FUN5_ARR]], i64 8, i1 false)
@@ -335,25 +373,44 @@ void func7() {
 }
 
 // CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 1>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>, ["arr", init]
-// CIR: %[[ARR_TMP:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["arrayinit.temp", init]
-// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>), !cir.ptr<!cir.ptr<!s32i>>
-// CIR: cir.store{{.*}} %[[ARR_PTR]], %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
-// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
-// CIR: %[[NULL_PTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i>
-// CIR: cir.store{{.*}} %[[NULL_PTR]], %[[TMP]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
-// CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
-// CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[TMP]] : !cir.ptr<!cir.ptr<!s32i>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.ptr<!s32i>>
-// CIR: cir.store{{.*}} %[[ELE_PTR]], %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
+// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["arrayinit.temp", init]
+// CIR: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>), !cir.ptr<!cir.ptr<!s32i>>
+// CIR: cir.store{{.*}} %[[ARR_0]], %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
+// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
+// CIR: %[[ARR_END:.*]] = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!cir.ptr<!s32i>>, %[[ONE]] : !s64i), !cir.ptr<!cir.ptr<!s32i>>
+// CIR: cir.do {
+// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR:   %[[NULL_PTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i>
+// CIR:   cir.store{{.*}} %[[NULL_PTR]], %[[ARR_CUR]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR:   %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
+// CIR:   %[[ARR_NEXT:.*]] = cir.ptr_stride(%[[ARR_CUR]] : !cir.ptr<!cir.ptr<!s32i>>, %[[ONE]] : !s64i), !cir.ptr<!cir.ptr<!s32i>>
+// CIR:   cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
+// CIR:   cir.yield
+// CIR: } while {
+// CIR:   %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
+// CIR:   %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]]) : !cir.ptr<!cir.ptr<!s32i>>, !cir.bool
+// CIR:   cir.condition(%[[CMP]])
+// CIR: }
 
 // LLVM: define{{.*}} void @_Z5func7v()
-// LLVM:  %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
-// LLVM:  %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8
-// LLVM:  %[[ELE_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
-// LLVM:  store ptr %[[ELE_PTR]], ptr %[[ALLOCA]], align 8
-// LLVM:  %[[TMP:.*]] = load ptr, ptr %[[ALLOCA]], align 8
-// LLVM:  store ptr null, ptr %[[TMP]], align 8
-// LLVM:  %[[ELE:.*]] = getelementptr ptr, ptr %[[TMP]], i64 1
-// LLVM:  store ptr %[[ELE]], ptr %[[ALLOCA]], align 8
+// LLVM:   %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
+// LLVM:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// LLVM:   %[[ARR_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
+// LLVM:   store ptr %[[ARR_PTR]], ptr %[[TMP]], align 8
+// LLVM:   %[[END_PTR:.*]] = getelementptr ptr, ptr %[[ARR_PTR]], i64 1
+// LLVM:   br label %[[LOOP_BODY:.*]]
+// LLVM: [[LOOP_NEXT:.*]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
+// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
+// LLVM: [[LOOP_BODY]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// LLVM:   store ptr null, ptr %[[CUR]], align 8
+// LLVM:   %[[NEXT:.*]] = getelementptr ptr, ptr %[[CUR]], i64 1
+// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
+// LLVM:   br label %[[LOOP_NEXT]]
+// LLVM: [[LOOP_END]]:
+// LLVM:   ret void
 
 // OGCG: %[[ARR:.*]] = alloca [1 x ptr], align 8
 // OGCG: call void @llvm.memset.p0.i64(ptr align 8 %[[ARR]], i8 0, i64 8, i1 false)
diff --git a/clang/test/CIR/CodeGen/builtins.cpp b/clang/test/CIR/CodeGen/builtins.cpp
new file mode 100644
index 0000000..3d43821
--- /dev/null
+++ b/clang/test/CIR/CodeGen/builtins.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+double fabs(double x) {
+  return __builtin_fabs(x);
+}
+
+// CIR: {{.*}} = cir.fabs {{.*}} : !cir.double
+// LLVM: {{.*}} = call double @llvm.fabs.f64(double {{.*}})
+// OGCG: {{.*}} = call double @llvm.fabs.f64(double {{.*}})
diff --git a/clang/test/CIR/CodeGen/destructors.cpp b/clang/test/CIR/CodeGen/destructors.cpp
index de7718f..fde0732 100644
--- a/clang/test/CIR/CodeGen/destructors.cpp
+++ b/clang/test/CIR/CodeGen/destructors.cpp
@@ -55,3 +55,102 @@ struct inline_destructor {
 // CIR-NOT: cir.func {{.*}}inline_destructor{{.*}}
 // LLVM-NOT: define {{.*}}inline_destructor{{.*}}
 // OGCG-NOT: define {{.*}}inline_destructor{{.*}}
+
+struct array_element {~array_element();};
+void test_array_destructor() {
+  array_element arr[5]{};
+}
+
+// CIR: cir.func dso_local @_Z21test_array_destructorv()
+// CIR:   %[[ARR:.*]] = cir.alloca !cir.array<!rec_array_element x 5>, !cir.ptr<!cir.array<!rec_array_element x 5>>, ["arr", init]
+// CIR:   %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!rec_array_element>, !cir.ptr<!cir.ptr<!rec_array_element>>, ["arrayinit.temp", init]
+// CIR:   %[[BEGIN:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!rec_array_element x 5>>)
+// CIR:   cir.store{{.*}} %[[BEGIN]], %[[ARR_PTR]]
+// CIR:   %[[FIVE:.*]] = cir.const #cir.int<5> : !s64i
+// CIR:   %[[ARR_END:.*]] = cir.ptr_stride(%[[BEGIN]] : !cir.ptr<!rec_array_element>, %[[FIVE]] : !s64i)
+// CIR:   cir.do {
+// CIR:     %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]]
+// CIR:     %[[ONE:.*]] = cir.const #cir.int<1> : !s64i
+// CIR:     %[[ARR_NEXT:.*]] = cir.ptr_stride(%[[ARR_CUR]] : !cir.ptr<!rec_array_element>, %[[ONE]] : !s64i)
+// CIR:     cir.store{{.*}} %[[ARR_NEXT]], %[[ARR_PTR]] : !cir.ptr<!rec_array_element>, !cir.ptr<!cir.ptr<!rec_array_element>>
+// CIR:     cir.yield
+// CIR:   } while {
+// CIR:     %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]]
+// CIR:     %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[ARR_END]])
+// CIR:     cir.condition(%[[CMP]])
+// CIR:   }
+// CIR:   %[[FOUR:.*]] = cir.const #cir.int<4> : !u64i
+// CIR:   %[[BEGIN:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!rec_array_element x 5>>)
+// CIR:   %[[END:.*]] = cir.ptr_stride(%[[BEGIN]] : !cir.ptr<!rec_array_element>, %[[FOUR]] : !u64i)
+// CIR:   %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!rec_array_element>, !cir.ptr<!cir.ptr<!rec_array_element>>, ["__array_idx"]
+// CIR:   cir.store %[[END]], %[[ARR_PTR]]
+// CIR:   cir.do {
+// CIR:     %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]]
+// CIR:     cir.call @_ZN13array_elementD1Ev(%[[ARR_CUR]]) nothrow : (!cir.ptr<!rec_array_element>) -> ()
+// CIR:     %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s64i
+// CIR:     %[[ARR_NEXT:.*]] = cir.ptr_stride(%[[ARR_CUR]] : !cir.ptr<!rec_array_element>, %[[NEG_ONE]] : !s64i)
+// CIR:     cir.store %[[ARR_NEXT]], %[[ARR_PTR]]
+// CIR:     cir.yield
+// CIR:   } while {
+// CIR:     %[[ARR_CUR:.*]] = cir.load{{.*}} %[[ARR_PTR]]
+// CIR:     %[[CMP:.*]] = cir.cmp(ne, %[[ARR_CUR]], %[[BEGIN]])
+// CIR:     cir.condition(%[[CMP]])
+// CIR:   }
+
+// LLVM: define{{.*}} void @_Z21test_array_destructorv()
+// LLVM:   %[[ARR:.*]] = alloca [5 x %struct.array_element]
+// LLVM:   %[[TMP:.*]] = alloca ptr
+// LLVM:   %[[ARR_PTR:.*]] = getelementptr %struct.array_element, ptr %[[ARR]], i32 0
+// LLVM:   store ptr %[[ARR_PTR]], ptr %[[TMP]]
+// LLVM:   %[[END_PTR:.*]] = getelementptr %struct.array_element, ptr %[[ARR_PTR]], i64 5
+// LLVM:   br label %[[INIT_LOOP_BODY:.*]]
+// LLVM: [[INIT_LOOP_NEXT:.*]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]]
+// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
+// LLVM:   br i1 %[[CMP]], label %[[INIT_LOOP_BODY]], label %[[INIT_LOOP_END:.*]]
+// LLVM: [[INIT_LOOP_BODY]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[TMP]]
+// LLVM:   %[[NEXT:.*]] = getelementptr %struct.array_element, ptr %[[CUR]], i64 1
+// LLVM:   store ptr %[[NEXT]], ptr %[[TMP]]
+// LLVM:   br label %[[INIT_LOOP_NEXT]]
+// LLVM: [[INIT_LOOP_END]]:
+// LLVM:   %[[ARR_BEGIN:.*]] = getelementptr %struct.array_element, ptr %[[ARR]], i32 0
+// LLVM:   %[[ARR_END:.*]] = getelementptr %struct.array_element, ptr %[[ARR_BEGIN]], i64 4
+// LLVM:   %[[ARR_CUR:.*]] = alloca ptr
+// LLVM:   store ptr %[[ARR_END]], ptr %[[ARR_CUR]]
+// LLVM:   br label %[[DESTROY_LOOP_BODY:.*]]
+// LLVM: [[DESTROY_LOOP_NEXT:.*]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[ARR_CUR]]
+// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[ARR_BEGIN]]
+// LLVM:   br i1 %[[CMP]], label %[[DESTROY_LOOP_BODY]], label %[[DESTROY_LOOP_END:.*]]
+// LLVM: [[DESTROY_LOOP_BODY]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[ARR_CUR]]
+// LLVM:   call void @_ZN13array_elementD1Ev(ptr %[[CUR]])
+// LLVM:   %[[PREV:.*]] = getelementptr %struct.array_element, ptr %[[CUR]], i64 -1
+// LLVM:   store ptr %[[PREV]], ptr %[[ARR_CUR]]
+// LLVM:   br label %[[DESTROY_LOOP_NEXT]]
+// LLVM: [[DESTROY_LOOP_END]]:
+// LLVM:   ret void
+
+// OGCG: define{{.*}} void @_Z21test_array_destructorv()
+// OGCG: entry:
+// OGCG:   %[[ARR:.*]] = alloca [5 x %struct.array_element]
+// OGCG:   %[[ARRAYINIT_END:.*]] = getelementptr inbounds %struct.array_element, ptr %[[ARR]], i64 5
+// OGCG:   br label %[[INIT_LOOP_BODY:.*]]
+// OGCG: [[INIT_LOOP_BODY]]:
+// OGCG:   %[[CUR:.*]] = phi ptr [ %[[ARR]], %entry ], [ %[[NEXT:.*]], %[[INIT_LOOP_BODY]] ]
+// OGCG:   %[[NEXT]] = getelementptr inbounds %struct.array_element, ptr %[[CUR]], i64 1
+// OGCG:   %[[CMP:.*]] = icmp eq ptr %[[NEXT]], %[[ARRAYINIT_END]]
+// OGCG:   br i1 %[[CMP]], label %[[INIT_LOOP_END:.*]], label %[[INIT_LOOP_BODY]]
+// OGCG: [[INIT_LOOP_END]]:
+// OGCG:   %[[BEGIN:.*]] = getelementptr inbounds [5 x %struct.array_element], ptr %[[ARR]], i32 0, i32 0
+// OGCG:   %[[END:.*]] = getelementptr inbounds %struct.array_element, ptr %[[BEGIN]], i64 5
+// OGCG:   br label %[[DESTROY_LOOP_BODY:.*]]
+// OGCG: [[DESTROY_LOOP_BODY]]:
+// OGCG:   %[[CUR:.*]] = phi ptr [ %[[END]], %[[INIT_LOOP_END]] ], [ %[[PREV:.*]], %[[DESTROY_LOOP_BODY]] ]
+// OGCG:   %[[PREV]] = getelementptr inbounds %struct.array_element, ptr %[[CUR]], i64 -1
+// OGCG:   call void @_ZN13array_elementD1Ev(ptr {{.*}} %[[PREV]])
+// OGCG:   %[[CMP:.*]] = icmp eq ptr %[[PREV]], %[[BEGIN]]
+// OGCG:   br i1 %[[CMP]], label %[[DESTROY_LOOP_END:.*]], label %[[DESTROY_LOOP_BODY]]
+// OGCG: [[DESTROY_LOOP_END]]:
+// OGCG:   ret void
diff --git a/clang/test/CIR/CodeGen/label.c b/clang/test/CIR/CodeGen/label.c
new file mode 100644
index 0000000..2a515fc4
--- /dev/null
+++ b/clang/test/CIR/CodeGen/label.c
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+void label() {
+labelA:
+  return;
+}
+
+// CIR:  cir.func no_proto dso_local @label
+// CIR:    cir.label "labelA"
+// CIR:    cir.return
+
+// Note: We are not lowering to LLVM IR via CIR at this stage because that
+// process depends on the GotoSolver.
+
+// OGCG: define dso_local void @label
+// OGCG:   br label %labelA
+// OGCG: labelA:
+// OGCG:   ret void
+
+void multiple_labels() {
+labelB:
+labelC:
+  return;
+}
+
+// CIR:  cir.func no_proto dso_local @multiple_labels
+// CIR:    cir.label "labelB"
+// CIR:    cir.br ^bb1
+// CIR:  ^bb1:  // pred: ^bb0
+// CIR:    cir.label "labelC"
+// CIR:    cir.return
+
+// OGCG: define dso_local void @multiple_labels
+// OGCG:   br label %labelB
+// OGCG: labelB:
+// OGCG:   br label %labelC
+// OGCG: labelC:
+// OGCG:   ret void
+
+void label_in_if(int cond) {
+  if (cond) {
+labelD:
+    cond++;
+  }
+}
+
+// CIR:  cir.func dso_local @label_in_if
+// CIR:      cir.if {{.*}} {
+// CIR:        cir.label "labelD"
+// CIR:        [[LOAD:%.*]] = cir.load align(4) [[COND:%.*]] : !cir.ptr<!s32i>, !s32i
+// CIR:        [[INC:%.*]] = cir.unary(inc, [[LOAD]]) nsw : !s32i, !s32i
+// CIR:        cir.store align(4) [[INC]], [[COND]] : !s32i, !cir.ptr<!s32i>
+// CIR:      }
+// CIR:    cir.return
+
+// OGCG: define dso_local void @label_in_if
+// OGCG: if.then:
+// OGCG:   br label %labelD
+// OGCG: labelD:
+// OGCG:   [[LOAD:%.*]] = load i32, ptr [[COND:%.*]], align 4
+// OGCG:   [[INC:%.*]] = add nsw i32 [[LOAD]], 1
+// OGCG:   store i32 [[INC]], ptr [[COND]], align 4
+// OGCG:   br label %if.end
+// OGCG: if.end:
+// OGCG:   ret void
+
+void after_return() {
+  return;
+  label:
+}
+
+// CIR:  cir.func no_proto dso_local @after_return
+// CIR:    cir.br ^bb1
+// CIR:  ^bb1:  // 2 preds: ^bb0, ^bb2
+// CIR:    cir.return
+// CIR:  ^bb2:  // no predecessors
+// CIR:    cir.label "label"
+// CIR:    cir.br ^bb1
+
+// OGCG: define dso_local void @after_return
+// OGCG:   br label %label
+// OGCG: label:
+// OGCG:   ret void
+
+
+void after_unreachable() {
+  __builtin_unreachable();
+  label:
+}
+
+// CIR:  cir.func no_proto dso_local @after_unreachable
+// CIR:    cir.unreachable
+// CIR:  ^bb1:
+// CIR:    cir.label "label"
+// CIR:    cir.return
+
+// OGCG: define dso_local void @after_unreachable
+// OGCG:   unreachable
+// OGCG: label:
+// OGCG:   ret void
diff --git a/clang/test/CIR/CodeGen/static-vars.cpp b/clang/test/CIR/CodeGen/static-vars.cpp
index d949936..4f22fc7ab 100644
--- a/clang/test/CIR/CodeGen/static-vars.cpp
+++ b/clang/test/CIR/CodeGen/static-vars.cpp
@@ -2,6 +2,37 @@
 // RUN: FileCheck --input-file=%t.cir %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t1.ll
 // RUN: FileCheck --check-prefix=LLVM --input-file=%t1.ll %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t1.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t1.ll %s
+
+template<typename T>
+struct implicitly_instantiated {
+  static T member;
+};
+
+template<typename T>
+T implicitly_instantiated<T>::member = 12345;
+
+int use_implicitly_instantiated() {
+  return implicitly_instantiated<int>::member;
+}
+
+// CHECK-DAG: cir.global linkonce_odr comdat @_ZN23implicitly_instantiatedIiE6memberE = #cir.int<12345> : !s32i
+// LLVM-DAG: @_ZN23implicitly_instantiatedIiE6memberE = linkonce_odr global i32 12345, comdat
+// OGCG-DAG: @_ZN23implicitly_instantiatedIiE6memberE = linkonce_odr global i32 12345, comdat
+
+template<typename T>
+struct explicitly_instantiated {
+    static T member;
+};
+
+template<typename T>
+T explicitly_instantiated<T>::member = 54321;
+
+template int explicitly_instantiated<int>::member;
+// CHECK-DAG: cir.global weak_odr comdat @_ZN23explicitly_instantiatedIiE6memberE = #cir.int<54321> : !s32i
+// LLVM-DAG: @_ZN23explicitly_instantiatedIiE6memberE = weak_odr global i32 54321, comdat
+// OGCG-DAG: @_ZN23explicitly_instantiatedIiE6memberE = weak_odr global i32 54321, comdat
 
 void func1(void) {
   // Should lower default-initialized static vars.
@@ -42,6 +73,8 @@ void func2(void) {
 
 // LLVM-DAG: $_ZZ4testvE1c = comdat any
 // LLVM-DAG: @_ZZ4testvE1c = linkonce_odr global i32 0, comdat, align 4
+// OGCG-DAG: $_ZZ4testvE1c = comdat any
+// OGCG-DAG: @_ZZ4testvE1c = linkonce_odr global i32 0, comdat, align 4
 
 inline void test() { static int c; }
 // CHECK-LABEL: @_Z4testv
diff --git a/clang/test/CIR/CodeGenOpenACC/init.c b/clang/test/CIR/CodeGenOpenACC/init.c
index 177e5a6..805fb08 100644
--- a/clang/test/CIR/CodeGenOpenACC/init.c
+++ b/clang/test/CIR/CodeGenOpenACC/init.c
@@ -11,12 +11,8 @@ void acc_init(int cond) {
   // CHECK-NEXT: acc.init attributes {device_types = [#acc.device_type<star>]}
 #pragma acc init device_type(nvidia)
   // CHECK-NEXT: acc.init attributes {device_types = [#acc.device_type<nvidia>]}
-#pragma acc init device_type(host, multicore)
-  // CHECK-NEXT: acc.init attributes {device_types = [#acc.device_type<host>, #acc.device_type<multicore>]}
 #pragma acc init device_type(NVIDIA)
   // CHECK-NEXT: acc.init attributes {device_types = [#acc.device_type<nvidia>]}
-#pragma acc init device_type(HoSt, MuLtIcORe)
-  // CHECK-NEXT: acc.init attributes {device_types = [#acc.device_type<host>, #acc.device_type<multicore>]}
 #pragma acc init device_type(HoSt) device_type(MuLtIcORe)
   // CHECK-NEXT: acc.init attributes {device_types = [#acc.device_type<host>, #acc.device_type<multicore>]}
 
diff --git a/clang/test/CIR/CodeGenOpenACC/shutdown.c b/clang/test/CIR/CodeGenOpenACC/shutdown.c
index 52db382..b68ef90 100644
--- a/clang/test/CIR/CodeGenOpenACC/shutdown.c
+++ b/clang/test/CIR/CodeGenOpenACC/shutdown.c
@@ -11,12 +11,8 @@ void acc_shutdown(int cond) {
   // CHECK-NEXT: acc.shutdown attributes {device_types = [#acc.device_type<star>]}
 #pragma acc shutdown device_type(nvidia)
   // CHECK-NEXT: acc.shutdown attributes {device_types = [#acc.device_type<nvidia>]}
-#pragma acc shutdown device_type(host, multicore)
-  // CHECK-NEXT: acc.shutdown attributes {device_types = [#acc.device_type<host>, #acc.device_type<multicore>]}
 #pragma acc shutdown device_type(NVIDIA)
   // CHECK-NEXT: acc.shutdown attributes {device_types = [#acc.device_type<nvidia>]}
-#pragma acc shutdown device_type(HoSt, MuLtIcORe)
-  // CHECK-NEXT: acc.shutdown attributes {device_types = [#acc.device_type<host>, #acc.device_type<multicore>]}
 #pragma acc shutdown device_type(HoSt) device_type(MuLtIcORe)
   // CHECK-NEXT: acc.shutdown attributes {device_types = [#acc.device_type<host>, #acc.device_type<multicore>]}
 
diff --git a/clang/test/CIR/IR/invalid-label.cir b/clang/test/CIR/IR/invalid-label.cir
new file mode 100644
index 0000000..4cb8d01
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-label.cir
@@ -0,0 +1,12 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  // expected-error@+3 {{must be the first operation in a block}}
+  cir.func @error(){
+    %0 = cir.const #cir.int<0> : !s32i
+    cir.label "label"
+    cir.return
+  }
+}
diff --git a/clang/test/CIR/IR/label.cir b/clang/test/CIR/IR/label.cir
new file mode 100644
index 0000000..2211a4e
--- /dev/null
+++ b/clang/test/CIR/IR/label.cir
@@ -0,0 +1,26 @@
+// RUN: cir-opt %s | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @label() {
+    cir.label "label"
+    cir.return
+  }
+
+  cir.func @label2() {
+    %0 = cir.const #cir.int<0> : !s32i
+    cir.br ^bb1
+  ^bb1:  // pred: ^bb0
+    cir.label "label2"
+    cir.return
+  }
+}
+
+// CHECK:       cir.func @label
+// CHECK-NEXT:    cir.label "label"
+
+// CHECK:       cir.func @label2
+// CHECK:        cir.br ^bb1
+// CHECK-NEXT:  ^bb1:  // pred: ^bb0
+// CHECK-NEXT:    cir.label "label2"
diff --git a/clang/test/CIR/Lowering/array.cpp b/clang/test/CIR/Lowering/array.cpp
index 438d41e..82d803a 100644
--- a/clang/test/CIR/Lowering/array.cpp
+++ b/clang/test/CIR/Lowering/array.cpp
@@ -57,17 +57,28 @@ void func() {
 void func2() {
   int arr[2] = {5};
 }
+
 // CHECK: define{{.*}} void @_Z5func2v()
-// CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4
-// CHECK:  %[[TMP:.*]] = alloca ptr, i64 1, align 8
-// CHECK:  %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
-// CHECK:  store i32 5, ptr %[[ARR_PTR]], align 4
-// CHECK:  %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
-// CHECK:  store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
-// CHECK:  %[[TMP2:.*]] = load ptr, ptr %[[TMP]], align 8
-// CHECK:  store i32 0, ptr %[[TMP2]], align 4
-// CHECK:  %[[ELE_1:.*]] = getelementptr i32, ptr %[[TMP2]], i64 1
-// CHECK:  store ptr %[[ELE_1]], ptr %[[TMP]], align 8
+// CHECK:   %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
+// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// CHECK:   %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
+// CHECK:   store i32 5, ptr %[[ARR_PTR]], align 4
+// CHECK:   %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
+// CHECK:   store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
+// CHECK:   %[[END_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 2
+// CHECK:   br label %[[LOOP_BODY:.*]]
+// CHECK: [[LOOP_NEXT:.*]]:
+// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
+// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
+// CHECK: [[LOOP_BODY]]:
+// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// CHECK:   store i32 0, ptr %[[CUR]], align 4
+// CHECK:   %[[NEXT:.*]] = getelementptr i32, ptr %[[CUR]], i64 1
+// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
+// CHECK:   br label %[[LOOP_NEXT]]
+// CHECK: [[LOOP_END]]:
+// CHECK:   ret void
 
 void func3() {
   int arr3[2] = {5, 6};
@@ -103,17 +114,27 @@ void func5() {
   int arr[2][1] = {{5}};
 }
 // CHECK: define{{.*}} void @_Z5func5v()
-// CHECK:  %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
-// CHECK:  %[[TMP:.*]] = alloca ptr, i64 1, align 8
-// CHECK:  %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
-// CHECK:  %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
-// CHECK:  store i32 5, ptr %[[ARR_0]], align 4
-// CHECK:  %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
-// CHECK:  store ptr %[[ARR_1]], ptr %[[TMP]], align 8
-// CHECK:  %[[ARR_1_VAL:.*]] = load ptr, ptr %[[TMP]], align 8
-// CHECK:  store [1 x i32] zeroinitializer, ptr %[[ARR_1_VAL]], align 4
-// CHECK:  %[[ARR_1_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_1_VAL]], i64 1
-// CHECK:  store ptr %[[ARR_1_PTR]], ptr %[[TMP]], align 8
+// CHECK:   %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
+// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// CHECK:   %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0
+// CHECK:   %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
+// CHECK:   store i32 5, ptr %[[ARR_0]], align 4
+// CHECK:   %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
+// CHECK:   store ptr %[[ARR_1]], ptr %[[TMP]], align 8
+// CHECK:   %[[END_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 2
+// CHECK:   br label %[[LOOP_BODY:.*]]
+// CHECK: [[LOOP_NEXT:.*]]:
+// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
+// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
+// CHECK: [[LOOP_BODY]]:
+// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// CHECK:   store [1 x i32] zeroinitializer, ptr %[[CUR]], align 4
+// CHECK:   %[[NEXT:.*]] = getelementptr [1 x i32], ptr %[[CUR]], i64 1
+// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
+// CHECK:   br label %[[LOOP_NEXT]]
+// CHECK: [[LOOP_END]]:
+// CHECK:   ret void
 
 void func6() {
   int x = 4;
@@ -133,14 +154,24 @@ void func7() {
   int* arr[1] = {};
 }
 // CHECK: define{{.*}} void @_Z5func7v()
-// CHECK:  %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
-// CHECK:  %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8
-// CHECK:  %[[ELE_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
-// CHECK:  store ptr %[[ELE_PTR]], ptr %[[ALLOCA]], align 8
-// CHECK:  %[[TMP:.*]] = load ptr, ptr %[[ALLOCA]], align 8
-// CHECK:  store ptr null, ptr %[[TMP]], align 8
-// CHECK:  %[[ELE:.*]] = getelementptr ptr, ptr %[[TMP]], i64 1
-// CHECK:  store ptr %[[ELE]], ptr %[[ALLOCA]], align 8
+// CHECK:   %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
+// CHECK:   %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// CHECK:   %[[ARR_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
+// CHECK:   store ptr %[[ARR_PTR]], ptr %[[TMP]], align 8
+// CHECK:   %[[END_PTR:.*]] = getelementptr ptr, ptr %[[ARR_PTR]], i64 1
+// CHECK:   br label %[[LOOP_BODY:.*]]
+// CHECK: [[LOOP_NEXT:.*]]:
+// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// CHECK:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[END_PTR]]
+// CHECK:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
+// CHECK: [[LOOP_BODY]]:
+// CHECK:   %[[CUR:.*]] = load ptr, ptr %[[TMP]], align 8
+// CHECK:   store ptr null, ptr %[[CUR]], align 8
+// CHECK:   %[[NEXT:.*]] = getelementptr ptr, ptr %[[CUR]], i64 1
+// CHECK:   store ptr %[[NEXT]], ptr %[[TMP]], align 8
+// CHECK:   br label %[[LOOP_NEXT:.*]]
+// CHECK: [[LOOP_END]]:
+// CHECK:   ret void
 
 void func8(int p[10]) {}
 // CHECK: define{{.*}} void @_Z5func8Pi(ptr {{%.*}})
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 28cad00..e0a21c2 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -46,14 +46,14 @@ __m256d test_mm256_and_pd(__m256d A, __m256d B) {
   // CHECK: and <4 x i64>
   return _mm256_and_pd(A, B);
 }
-TEST_CONSTEXPR(match_m256d(_mm256_and_pd((__m256d){-4.0, -5.0, +6.0, +7.0}, (__m256d){+0.0, -0.0, -0.0, +7.0}), -0.0, -0.0, +0.0, +7.0));
+TEST_CONSTEXPR(match_m256d(_mm256_and_pd((__m256d){-4.0, -5.0, +6.0, +7.0}, (__m256d){+0.0, -0.0, -0.0, +7.0}), +0.0, -0.0, +0.0, +7.0));
 
 __m256 test_mm256_and_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_and_ps
   // CHECK: and <8 x i32>
   return _mm256_and_ps(A, B);
 }
-TEST_CONSTEXPR(match_m256(_mm256_and_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), -0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, -0.0f));
+TEST_CONSTEXPR(match_m256(_mm256_and_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), +0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, +0.0f));
 
 __m256d test_mm256_andnot_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_andnot_pd
@@ -61,7 +61,7 @@ __m256d test_mm256_andnot_pd(__m256d A, __m256d B) {
   // CHECK: and <4 x i64>
   return _mm256_andnot_pd(A, B);
 }
-TEST_CONSTEXPR(match_m256d(_mm256_andnot_pd((__m256d){-4.0, -5.0, +6.0, +7.0}, (__m256d){+0.0, -0.0, -0.0, +7.0}), +0.0, +0.0, +0.0, +0.0));
+TEST_CONSTEXPR(match_m256d(_mm256_andnot_pd((__m256d){-4.0, -5.0, +6.0, +7.0}, (__m256d){+0.0, -0.0, -0.0, +7.0}), +0.0, +0.0, -0.0, +0.0));
 
 __m256 test_mm256_andnot_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_andnot_ps
@@ -69,7 +69,7 @@ __m256 test_mm256_andnot_ps(__m256 A, __m256 B) {
   // CHECK: and <8 x i32>
   return _mm256_andnot_ps(A, B);
 }
-TEST_CONSTEXPR(match_m256(_mm256_andnot_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f));
+TEST_CONSTEXPR(match_m256(_mm256_andnot_ps((__m256){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m256){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), +0.0f, +0.0f, -0.0f, +0.0f, +0.0f, -0.0f, +0.0f, +0.0f));
 
 __m256d test_mm256_blend_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_blend_pd
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 76d26d5..e7f8106 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -160,6 +160,7 @@ __m128i test_mm_broadcastb_epi8(__m128i a) {
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
   return _mm_broadcastb_epi8(a);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_broadcastb_epi8((__m128i)(__v16qi){42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
 
 __m256i test_mm256_broadcastb_epi8(__m128i a) {
   // CHECK-LABEL: test_mm256_broadcastb_epi8
@@ -167,6 +168,7 @@ __m256i test_mm256_broadcastb_epi8(__m128i a) {
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
   return _mm256_broadcastb_epi8(a);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_broadcastb_epi8((__m128i)(__v16qi){42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
 
 __m128i test_mm_broadcastd_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_broadcastd_epi32
@@ -174,6 +176,7 @@ __m128i test_mm_broadcastd_epi32(__m128i a) {
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
   return _mm_broadcastd_epi32(a);
 }
+TEST_CONSTEXPR(match_v4si(_mm_broadcastd_epi32((__m128i)(__v4si){-42, 0, 0, 0}), -42, -42, -42, -42));
 
 __m256i test_mm256_broadcastd_epi32(__m128i a) {
   // CHECK-LABEL: test_mm256_broadcastd_epi32
@@ -181,6 +184,7 @@ __m256i test_mm256_broadcastd_epi32(__m128i a) {
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
   return _mm256_broadcastd_epi32(a);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_broadcastd_epi32((__m128i)(__v4si){-42, 0, 0, 0}), -42, -42, -42, -42, -42, -42, -42, -42));
 
 __m128i test_mm_broadcastq_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_broadcastq_epi64
@@ -188,6 +192,7 @@ __m128i test_mm_broadcastq_epi64(__m128i a) {
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
   return _mm_broadcastq_epi64(a);
 }
+TEST_CONSTEXPR(match_v2di(_mm_broadcastq_epi64((__m128i)(__v2di){-42, 0}), -42, -42));
 
 __m256i test_mm256_broadcastq_epi64(__m128i a) {
   // CHECK-LABEL: test_mm256_broadcastq_epi64
@@ -195,12 +200,14 @@ __m256i test_mm256_broadcastq_epi64(__m128i a) {
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
   return _mm256_broadcastq_epi64(a);
 }
+TEST_CONSTEXPR(match_v4di(_mm256_broadcastq_epi64((__m128i)(__v2di){-42, 0}), -42, -42, -42, -42));
 
 __m128d test_mm_broadcastsd_pd(__m128d a) {
   // CHECK-LABEL: test_mm_broadcastsd_pd
   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
   return _mm_broadcastsd_pd(a);
 }
+TEST_CONSTEXPR(match_m128d(_mm_broadcastsd_pd((__m128d){+7.0, -7.0}), +7.0, +7.0));
 
 __m256d test_mm256_broadcastsd_pd(__m128d a) {
   // CHECK-LABEL: test_mm256_broadcastsd_pd
@@ -208,12 +215,14 @@ __m256d test_mm256_broadcastsd_pd(__m128d a) {
   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
   return _mm256_broadcastsd_pd(a);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_broadcastsd_pd((__m128d){+7.0, -7.0}), +7.0, +7.0, +7.0, +7.0));
 
 __m256i test_mm256_broadcastsi128_si256(__m128i a) {
   // CHECK-LABEL: test_mm256_broadcastsi128_si256
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   return _mm256_broadcastsi128_si256(a);
 }
+TEST_CONSTEXPR(match_m256i(_mm256_broadcastsi128_si256((__m128i)(__v2di){3, 45}), 3, 45, 3, 45));
 
 __m256i test_mm_broadcastsi128_si256(__m128i a) {
   // CHECK-LABEL: test_mm_broadcastsi128_si256
@@ -227,6 +236,7 @@ __m128 test_mm_broadcastss_ps(__m128 a) {
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
   return _mm_broadcastss_ps(a);
 }
+TEST_CONSTEXPR(match_m128(_mm_broadcastss_ps((__m128){-4.0f, +5.0f, +6.0f, +7.0f}), -4.0f, -4.0f, -4.0f, -4.0f));
 
 __m256 test_mm256_broadcastss_ps(__m128 a) {
   // CHECK-LABEL: test_mm256_broadcastss_ps
@@ -234,6 +244,7 @@ __m256 test_mm256_broadcastss_ps(__m128 a) {
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
   return _mm256_broadcastss_ps(a);
 }
+TEST_CONSTEXPR(match_m256(_mm256_broadcastss_ps((__m128){-4.0f, +5.0f, +6.0f, +7.0f}), -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f, -4.0f));
 
 __m128i test_mm_broadcastw_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_broadcastw_epi16
@@ -241,6 +252,7 @@ __m128i test_mm_broadcastw_epi16(__m128i a) {
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
   return _mm_broadcastw_epi16(a);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_broadcastw_epi16((__m128i)(__v8hi){42, 0, 0, 0, 0, 0, 0, 0}), 42, 42, 42, 42, 42, 42, 42, 42));
 
 __m256i test_mm256_broadcastw_epi16(__m128i a) {
   // CHECK-LABEL: test_mm256_broadcastw_epi16
@@ -248,6 +260,7 @@ __m256i test_mm256_broadcastw_epi16(__m128i a) {
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
   return _mm256_broadcastw_epi16(a);
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_broadcastw_epi16((__m128i)(__v8hi){42, 0, 0, 0, 0, 0, 0, 0}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
 
 __m256i test_mm256_bslli_epi128(__m256i a) {
   // CHECK-LABEL: test_mm256_bslli_epi128
@@ -1327,48 +1340,56 @@ __m256i test_mm256_unpackhi_epi8(__m256i a, __m256i b) {
   // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   return _mm256_unpackhi_epi8(a, b);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_unpackhi_epi8((__m256i)(__v32qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m256i)(__v32qi){32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63));
 
 __m256i test_mm256_unpackhi_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_unpackhi_epi16
   // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   return _mm256_unpackhi_epi16(a, b);
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_unpackhi_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m256i)(__v16hi){16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), 4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31));
 
 __m256i test_mm256_unpackhi_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_unpackhi_epi32
   // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   return _mm256_unpackhi_epi32(a, b);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_unpackhi_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m256i)(__v8si){8, 9, 10, 11, 12, 13, 14, 15}), 2, 10, 3, 11, 6, 14, 7, 15));
 
 __m256i test_mm256_unpackhi_epi64(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_unpackhi_epi64
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   return _mm256_unpackhi_epi64(a, b);
 }
+TEST_CONSTEXPR(match_v4di(_mm256_unpackhi_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m256i)(__v4di){ 4, 5, 6, 7}), 1, 5, 3, 7));
 
 __m256i test_mm256_unpacklo_epi8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_unpacklo_epi8
   // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
   return _mm256_unpacklo_epi8(a, b);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_unpacklo_epi8((__m256i)(__v32qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m256i)(__v32qi){32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55));
 
 __m256i test_mm256_unpacklo_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_unpacklo_epi16
   // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   return _mm256_unpacklo_epi16(a, b);
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_unpacklo_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m256i)(__v16hi){16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), 0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27));
 
 __m256i test_mm256_unpacklo_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_unpacklo_epi32
   // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   return _mm256_unpacklo_epi32(a, b);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_unpacklo_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m256i)(__v8si){ 8, 9, 10, 11, 12, 13, 14, 15}), 0, 8, 1, 9, 4, 12, 5, 13));
 
 __m256i test_mm256_unpacklo_epi64(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_unpacklo_epi64
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   return _mm256_unpacklo_epi64(a, b);
 }
+TEST_CONSTEXPR(match_v4di(_mm256_unpacklo_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m256i)(__v4di){ 4, 5, 6, 7}), 0, 4, 2, 6));
 
 __m256i test_mm256_xor_si256(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_xor_si256
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 67f7df0..0bd9718 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1,26 +1,28 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
 
 
 #include <immintrin.h>
 #include "builtin_test_helpers.h"
 
 __mmask32 test_knot_mask32(__mmask32 a) {
-  // CHECK-LABEL: @test_knot_mask32
+  // CHECK-LABEL: test_knot_mask32
   // CHECK: [[IN:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[NOT:%.*]] = xor <32 x i1> [[IN]], splat (i1 true)
   return _knot_mask32(a);
 }
 
 __mmask64 test_knot_mask64(__mmask64 a) {
-  // CHECK-LABEL: @test_knot_mask64
+  // CHECK-LABEL: test_knot_mask64
   // CHECK: [[IN:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[NOT:%.*]] = xor <64 x i1> [[IN]], splat (i1 true)
   return _knot_mask64(a);
 }
 
 __mmask32 test_kand_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kand_mask32
+  // CHECK-LABEL: test_kand_mask32
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = and <32 x i1> [[LHS]], [[RHS]]
@@ -30,7 +32,7 @@ __mmask32 test_kand_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, _
 }
 
 __mmask64 test_kand_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kand_mask64
+  // CHECK-LABEL: test_kand_mask64
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = and <64 x i1> [[LHS]], [[RHS]]
@@ -40,7 +42,7 @@ __mmask64 test_kand_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, _
 }
 
 __mmask32 test_kandn_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kandn_mask32
+  // CHECK-LABEL: test_kandn_mask32
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[NOT:%.*]] = xor <32 x i1> [[LHS]], splat (i1 true)
@@ -51,7 +53,7 @@ __mmask32 test_kandn_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D,
 }
 
 __mmask64 test_kandn_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kandn_mask64
+  // CHECK-LABEL: test_kandn_mask64
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[NOT:%.*]] = xor <64 x i1> [[LHS]], splat (i1 true)
@@ -62,7 +64,7 @@ __mmask64 test_kandn_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D,
 }
 
 __mmask32 test_kor_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kor_mask32
+  // CHECK-LABEL: test_kor_mask32
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = or <32 x i1> [[LHS]], [[RHS]]
@@ -72,7 +74,7 @@ __mmask32 test_kor_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __
 }
 
 __mmask64 test_kor_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kor_mask64
+  // CHECK-LABEL: test_kor_mask64
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = or <64 x i1> [[LHS]], [[RHS]]
@@ -82,7 +84,7 @@ __mmask64 test_kor_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __
 }
 
 __mmask32 test_kxnor_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kxnor_mask32
+  // CHECK-LABEL: test_kxnor_mask32
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[NOT:%.*]] = xor <32 x i1> [[LHS]], splat (i1 true)
@@ -93,7 +95,7 @@ __mmask32 test_kxnor_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D,
 }
 
 __mmask64 test_kxnor_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kxnor_mask64
+  // CHECK-LABEL: test_kxnor_mask64
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[NOT:%.*]] = xor <64 x i1> [[LHS]], splat (i1 true)
@@ -104,7 +106,7 @@ __mmask64 test_kxnor_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D,
 }
 
 __mmask32 test_kxor_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kxor_mask32
+  // CHECK-LABEL: test_kxor_mask32
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = xor <32 x i1> [[LHS]], [[RHS]]
@@ -114,7 +116,7 @@ __mmask32 test_kxor_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, _
 }
 
 __mmask64 test_kxor_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kxor_mask64
+  // CHECK-LABEL: test_kxor_mask64
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = xor <64 x i1> [[LHS]], [[RHS]]
@@ -124,7 +126,7 @@ __mmask64 test_kxor_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, _
 }
 
 unsigned char test_kortestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_kortestz_mask32_u8
+  // CHECK-LABEL: test_kortestz_mask32_u8
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[OR:%.*]] = or <32 x i1> [[LHS]], [[RHS]]
@@ -137,7 +139,7 @@ unsigned char test_kortestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m
 }
 
 unsigned char test_kortestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_kortestc_mask32_u8
+  // CHECK-LABEL: test_kortestc_mask32_u8
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[OR:%.*]] = or <32 x i1> [[LHS]], [[RHS]]
@@ -150,7 +152,7 @@ unsigned char test_kortestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m
 }
 
 unsigned char test_kortest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
-  // CHECK-LABEL: @test_kortest_mask32_u8
+  // CHECK-LABEL: test_kortest_mask32_u8
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[OR:%.*]] = or <32 x i1> [[LHS]], [[RHS]]
@@ -170,7 +172,7 @@ unsigned char test_kortest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m5
 }
 
 unsigned char test_kortestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_kortestz_mask64_u8
+  // CHECK-LABEL: test_kortestz_mask64_u8
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[OR:%.*]] = or <64 x i1> [[LHS]], [[RHS]]
@@ -183,7 +185,7 @@ unsigned char test_kortestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m
 }
 
 unsigned char test_kortestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_kortestc_mask64_u8
+  // CHECK-LABEL: test_kortestc_mask64_u8
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[OR:%.*]] = or <64 x i1> [[LHS]], [[RHS]]
@@ -196,7 +198,7 @@ unsigned char test_kortestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m
 }
 
 unsigned char test_kortest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
-  // CHECK-LABEL: @test_kortest_mask64_u8
+  // CHECK-LABEL: test_kortest_mask64_u8
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[OR:%.*]] = or <64 x i1> [[LHS]], [[RHS]]
@@ -216,7 +218,7 @@ unsigned char test_kortest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m5
 }
 
 unsigned char test_ktestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_ktestz_mask32_u8
+  // CHECK-LABEL: test_ktestz_mask32_u8
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestz.d(<32 x i1> [[LHS]], <32 x i1> [[RHS]])
@@ -226,7 +228,7 @@ unsigned char test_ktestz_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m51
 }
 
 unsigned char test_ktestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_ktestc_mask32_u8
+  // CHECK-LABEL: test_ktestc_mask32_u8
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestc.d(<32 x i1> [[LHS]], <32 x i1> [[RHS]])
@@ -236,7 +238,7 @@ unsigned char test_ktestc_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m51
 }
 
 unsigned char test_ktest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
-  // CHECK-LABEL: @test_ktest_mask32_u8
+  // CHECK-LABEL: test_ktest_mask32_u8
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestc.d(<32 x i1> [[LHS]], <32 x i1> [[RHS]])
@@ -250,7 +252,7 @@ unsigned char test_ktest_mask32_u8(__m512i __A, __m512i __B, __m512i __C, __m512
 }
 
 unsigned char test_ktestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_ktestz_mask64_u8
+  // CHECK-LABEL: test_ktestz_mask64_u8
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestz.q(<64 x i1> [[LHS]], <64 x i1> [[RHS]])
@@ -260,7 +262,7 @@ unsigned char test_ktestz_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m51
 }
 
 unsigned char test_ktestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_ktestc_mask64_u8
+  // CHECK-LABEL: test_ktestc_mask64_u8
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestc.q(<64 x i1> [[LHS]], <64 x i1> [[RHS]])
@@ -270,7 +272,7 @@ unsigned char test_ktestc_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m51
 }
 
 unsigned char test_ktest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
-  // CHECK-LABEL: @test_ktest_mask64_u8
+  // CHECK-LABEL: test_ktest_mask64_u8
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = call i32 @llvm.x86.avx512.ktestc.q(<64 x i1> [[LHS]], <64 x i1> [[RHS]])
@@ -284,7 +286,7 @@ unsigned char test_ktest_mask64_u8(__m512i __A, __m512i __B, __m512i __C, __m512
 }
 
 __mmask32 test_kadd_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kadd_mask32
+  // CHECK-LABEL: test_kadd_mask32
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = call <32 x i1> @llvm.x86.avx512.kadd.d(<32 x i1> [[LHS]], <32 x i1> [[RHS]])
@@ -294,7 +296,7 @@ __mmask32 test_kadd_mask32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, _
 }
 
 __mmask64 test_kadd_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kadd_mask64
+  // CHECK-LABEL: test_kadd_mask64
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = call <64 x i1> @llvm.x86.avx512.kadd.q(<64 x i1> [[LHS]], <64 x i1> [[RHS]])
@@ -304,702 +306,702 @@ __mmask64 test_kadd_mask64(__m512i __A, __m512i __B, __m512i __C, __m512i __D, _
 }
 
 __mmask32 test_kshiftli_mask32(__m512i A, __m512i B, __m512i C, __m512i D) {
-  // CHECK-LABEL: @test_kshiftli_mask32
+  // CHECK-LABEL: test_kshiftli_mask32
   // CHECK: [[VAL:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = shufflevector <32 x i1> zeroinitializer, <32 x i1> [[VAL]], <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32>
   return _mm512_mask_cmpneq_epu16_mask(_kshiftli_mask32(_mm512_cmpneq_epu16_mask(A, B), 31), C, D);
 }
 
 __mmask32 test_kshiftri_mask32(__m512i A, __m512i B, __m512i C, __m512i D) {
-  // CHECK-LABEL: @test_kshiftri_mask32
+  // CHECK-LABEL: test_kshiftri_mask32
   // CHECK: [[VAL:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RES:%.*]] = shufflevector <32 x i1> [[VAL]], <32 x i1> zeroinitializer, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
   return _mm512_mask_cmpneq_epu16_mask(_kshiftri_mask32(_mm512_cmpneq_epu16_mask(A, B), 31), C, D);
 }
 
 __mmask64 test_kshiftli_mask64(__m512i A, __m512i B, __m512i C, __m512i D) {
-  // CHECK-LABEL: @test_kshiftli_mask64
+  // CHECK-LABEL: test_kshiftli_mask64
   // CHECK: [[VAL:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = shufflevector <64 x i1> zeroinitializer, <64 x i1> [[VAL]], <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
   return _mm512_mask_cmpneq_epu8_mask(_kshiftli_mask64(_mm512_cmpneq_epu8_mask(A, B), 32), C, D);
 }
 
 __mmask64 test_kshiftri_mask64(__m512i A, __m512i B, __m512i C, __m512i D) {
-  // CHECK-LABEL: @test_kshiftri_mask64
+  // CHECK-LABEL: test_kshiftri_mask64
   // CHECK: [[VAL:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RES:%.*]] = shufflevector <64 x i1> [[VAL]], <64 x i1> zeroinitializer, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
   return _mm512_mask_cmpneq_epu8_mask(_kshiftri_mask64(_mm512_cmpneq_epu8_mask(A, B), 32), C, D);
 }
 
 unsigned int test_cvtmask32_u32(__m512i A, __m512i B) {
-  // CHECK-LABEL: @test_cvtmask32_u32
+  // CHECK-LABEL: test_cvtmask32_u32
   return _cvtmask32_u32(_mm512_cmpneq_epu16_mask(A, B));
 }
 
 unsigned long long test_cvtmask64_u64(__m512i A, __m512i B) {
-  // CHECK-LABEL: @test_cvtmask64_u64
+  // CHECK-LABEL: test_cvtmask64_u64
   return _cvtmask64_u64(_mm512_cmpneq_epu8_mask(A, B));
 }
 
 __mmask32 test_cvtu32_mask32(__m512i A, __m512i B, unsigned int C) {
-  // CHECK-LABEL: @test_cvtu32_mask32
+  // CHECK-LABEL: test_cvtu32_mask32
   return _mm512_mask_cmpneq_epu16_mask(_cvtu32_mask32(C), A, B);
 }
 
 __mmask64 test_cvtu64_mask64(__m512i A, __m512i B, unsigned long long C) {
-  // CHECK-LABEL: @test_cvtu64_mask64
+  // CHECK-LABEL: test_cvtu64_mask64
   return _mm512_mask_cmpneq_epu8_mask(_cvtu64_mask64(C), A, B);
 }
 
 __mmask32 test_load_mask32(__mmask32 *A, __m512i B, __m512i C) {
-  // CHECK-LABEL: @test_load_mask32
+  // CHECK-LABEL: test_load_mask32
   // CHECK: [[LOAD:%.*]] = load i32, ptr %{{.*}}
   return _mm512_mask_cmpneq_epu16_mask(_load_mask32(A), B, C);
 }
 
 __mmask64 test_load_mask64(__mmask64 *A, __m512i B, __m512i C) {
-  // CHECK-LABEL: @test_load_mask64
+  // CHECK-LABEL: test_load_mask64
   // CHECK: [[LOAD:%.*]] = load i64, ptr %{{.*}}
   return _mm512_mask_cmpneq_epu8_mask(_load_mask64(A), B, C);
 }
 
 void test_store_mask32(__mmask32 *A, __m512i B, __m512i C) {
-  // CHECK-LABEL: @test_store_mask32
+  // CHECK-LABEL: test_store_mask32
   // CHECK: store i32 %{{.*}}, ptr %{{.*}}
   _store_mask32(A, _mm512_cmpneq_epu16_mask(B, C));
 }
 
 void test_store_mask64(__mmask64 *A, __m512i B, __m512i C) {
-  // CHECK-LABEL: @test_store_mask64
+  // CHECK-LABEL: test_store_mask64
   // CHECK: store i64 %{{.*}}, ptr %{{.*}}
   _store_mask64(A, _mm512_cmpneq_epu8_mask(B, C));
 }
 
 __mmask64 test_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_epi8_mask
+  // CHECK-LABEL: test_mm512_cmpeq_epi8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpeq_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_cmpeq_epi8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpeq_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_epi16_mask
+  // CHECK-LABEL: test_mm512_cmpeq_epi16_mask
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpeq_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_cmpeq_epi16_mask
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpeq_epi16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpgt_epi8_mask
+  // CHECK-LABEL: test_mm512_cmpgt_epi8_mask
   // CHECK: icmp sgt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpgt_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpgt_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_cmpgt_epi8_mask
   // CHECK: icmp sgt <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpgt_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpgt_epi16_mask
+  // CHECK-LABEL: test_mm512_cmpgt_epi16_mask
   // CHECK: icmp sgt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpgt_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpgt_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_cmpgt_epi16_mask
   // CHECK: icmp sgt <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpgt_epi16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_epu8_mask
+  // CHECK-LABEL: test_mm512_cmpeq_epu8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpeq_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_epu8_mask
+  // CHECK-LABEL: test_mm512_mask_cmpeq_epu8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpeq_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_epu16_mask
+  // CHECK-LABEL: test_mm512_cmpeq_epu16_mask
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpeq_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_epu16_mask
+  // CHECK-LABEL: test_mm512_mask_cmpeq_epu16_mask
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpeq_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpgt_epu8_mask
+  // CHECK-LABEL: test_mm512_cmpgt_epu8_mask
   // CHECK: icmp ugt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpgt_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpgt_epu8_mask
+  // CHECK-LABEL: test_mm512_mask_cmpgt_epu8_mask
   // CHECK: icmp ugt <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpgt_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpgt_epu16_mask
+  // CHECK-LABEL: test_mm512_cmpgt_epu16_mask
   // CHECK: icmp ugt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpgt_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpgt_epu16_mask
+  // CHECK-LABEL: test_mm512_mask_cmpgt_epu16_mask
   // CHECK: icmp ugt <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpgt_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpge_epi8_mask
+  // CHECK-LABEL: test_mm512_cmpge_epi8_mask
   // CHECK: icmp sge <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpge_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpge_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_cmpge_epi8_mask
   // CHECK: icmp sge <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpge_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpge_epu8_mask
+  // CHECK-LABEL: test_mm512_cmpge_epu8_mask
   // CHECK: icmp uge <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpge_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpge_epu8_mask
+  // CHECK-LABEL: test_mm512_mask_cmpge_epu8_mask
   // CHECK: icmp uge <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpge_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpge_epi16_mask
+  // CHECK-LABEL: test_mm512_cmpge_epi16_mask
   // CHECK: icmp sge <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpge_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpge_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_cmpge_epi16_mask
   // CHECK: icmp sge <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpge_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpge_epu16_mask
+  // CHECK-LABEL: test_mm512_cmpge_epu16_mask
   // CHECK: icmp uge <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpge_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpge_epu16_mask
+  // CHECK-LABEL: test_mm512_mask_cmpge_epu16_mask
   // CHECK: icmp uge <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpge_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmple_epi8_mask
+  // CHECK-LABEL: test_mm512_cmple_epi8_mask
   // CHECK: icmp sle <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmple_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_cmple_epi8_mask
   // CHECK: icmp sle <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmple_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmple_epu8_mask
+  // CHECK-LABEL: test_mm512_cmple_epu8_mask
   // CHECK: icmp ule <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmple_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_epu8_mask
+  // CHECK-LABEL: test_mm512_mask_cmple_epu8_mask
   // CHECK: icmp ule <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmple_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmple_epi16_mask
+  // CHECK-LABEL: test_mm512_cmple_epi16_mask
   // CHECK: icmp sle <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmple_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_cmple_epi16_mask
   // CHECK: icmp sle <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmple_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmple_epu16_mask
+  // CHECK-LABEL: test_mm512_cmple_epu16_mask
   // CHECK: icmp ule <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmple_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_epu16_mask
+  // CHECK-LABEL: test_mm512_mask_cmple_epu16_mask
   // CHECK: icmp ule <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmple_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmplt_epi8_mask
+  // CHECK-LABEL: test_mm512_cmplt_epi8_mask
   // CHECK: icmp slt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmplt_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_cmplt_epi8_mask
   // CHECK: icmp slt <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmplt_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmplt_epu8_mask
+  // CHECK-LABEL: test_mm512_cmplt_epu8_mask
   // CHECK: icmp ult <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmplt_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_epu8_mask
+  // CHECK-LABEL: test_mm512_mask_cmplt_epu8_mask
   // CHECK: icmp ult <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmplt_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmplt_epi16_mask
+  // CHECK-LABEL: test_mm512_cmplt_epi16_mask
   // CHECK: icmp slt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmplt_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_cmplt_epi16_mask
   // CHECK: icmp slt <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmplt_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmplt_epu16_mask
+  // CHECK-LABEL: test_mm512_cmplt_epu16_mask
   // CHECK: icmp ult <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmplt_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_epu16_mask
+  // CHECK-LABEL: test_mm512_mask_cmplt_epu16_mask
   // CHECK: icmp ult <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmplt_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_epi8_mask
+  // CHECK-LABEL: test_mm512_cmpneq_epi8_mask
   // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpneq_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_cmpneq_epi8_mask
   // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpneq_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_epu8_mask
+  // CHECK-LABEL: test_mm512_cmpneq_epu8_mask
   // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpneq_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_epu8_mask
+  // CHECK-LABEL: test_mm512_mask_cmpneq_epu8_mask
   // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpneq_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_epi16_mask
+  // CHECK-LABEL: test_mm512_cmpneq_epi16_mask
   // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpneq_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_cmpneq_epi16_mask
   // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpneq_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_epu16_mask
+  // CHECK-LABEL: test_mm512_cmpneq_epu16_mask
   // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpneq_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_epu16_mask
+  // CHECK-LABEL: test_mm512_mask_cmpneq_epu16_mask
   // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpneq_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmp_epi8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_epi8_mask
+  // CHECK-LABEL: test_mm512_cmp_epi8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmp_epi8_mask(__a, __b, 0);
 }
 
 __mmask64 test_mm512_mask_cmp_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_cmp_epi8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmp_epi8_mask(__u, __a, __b, 0);
 }
 
 __mmask64 test_mm512_cmp_epu8_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_epu8_mask
+  // CHECK-LABEL: test_mm512_cmp_epu8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmp_epu8_mask(__a, __b, 0);
 }
 
 __mmask64 test_mm512_mask_cmp_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_epu8_mask
+  // CHECK-LABEL: test_mm512_mask_cmp_epu8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmp_epu8_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm512_cmp_epi16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_epi16_mask
+  // CHECK-LABEL: test_mm512_cmp_epi16_mask
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmp_epi16_mask(__a, __b, 0);
 }
 
 __mmask32 test_mm512_mask_cmp_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_cmp_epi16_mask
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmp_epi16_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm512_cmp_epu16_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_epu16_mask
+  // CHECK-LABEL: test_mm512_cmp_epu16_mask
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmp_epu16_mask(__a, __b, 0);
 }
 
 __mmask32 test_mm512_mask_cmp_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_epu16_mask
+  // CHECK-LABEL: test_mm512_mask_cmp_epu16_mask
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmp_epu16_mask(__u, __a, __b, 0);
 }
 
 __m512i test_mm512_add_epi8 (__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_add_epi8
+  //CHECK-LABEL: test_mm512_add_epi8
   //CHECK: add <64 x i8>
   return _mm512_add_epi8(__A,__B);
 }
 
 __m512i test_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mask_add_epi8
+  //CHECK-LABEL: test_mm512_mask_add_epi8
   //CHECK: add <64 x i8> %{{.*}}, %{{.*}}
   //CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_add_epi8(__W, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_add_epi8
+  //CHECK-LABEL: test_mm512_maskz_add_epi8
   //CHECK: add <64 x i8> %{{.*}}, %{{.*}}
   //CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_add_epi8(__U, __A, __B);
 }
 
 __m512i test_mm512_sub_epi8 (__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_sub_epi8
+  //CHECK-LABEL: test_mm512_sub_epi8
   //CHECK: sub <64 x i8>
   return _mm512_sub_epi8(__A, __B);
 }
 
 __m512i test_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mask_sub_epi8
+  //CHECK-LABEL: test_mm512_mask_sub_epi8
   //CHECK: sub <64 x i8> %{{.*}}, %{{.*}}
   //CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_sub_epi8(__W, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_sub_epi8
+  //CHECK-LABEL: test_mm512_maskz_sub_epi8
   //CHECK: sub <64 x i8> %{{.*}}, %{{.*}}
   //CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_sub_epi8(__U, __A, __B);
 }
 
 __m512i test_mm512_add_epi16 (__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_add_epi16
+  //CHECK-LABEL: test_mm512_add_epi16
   //CHECK: add <32 x i16>
   return _mm512_add_epi16(__A, __B);
 }
 
 __m512i test_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mask_add_epi16
+  //CHECK-LABEL: test_mm512_mask_add_epi16
   //CHECK: add <32 x i16> %{{.*}}, %{{.*}}
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_add_epi16(__W, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_add_epi16
+  //CHECK-LABEL: test_mm512_maskz_add_epi16
   //CHECK: add <32 x i16> %{{.*}}, %{{.*}}
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_add_epi16(__U, __A, __B);
 }
 
 __m512i test_mm512_sub_epi16 (__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_sub_epi16
+  //CHECK-LABEL: test_mm512_sub_epi16
   //CHECK: sub <32 x i16>
   return _mm512_sub_epi16(__A, __B);
 }
 
 __m512i test_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mask_sub_epi16
+  //CHECK-LABEL: test_mm512_mask_sub_epi16
   //CHECK: sub <32 x i16> %{{.*}}, %{{.*}}
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_sub_epi16(__W, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_sub_epi16
+  //CHECK-LABEL: test_mm512_maskz_sub_epi16
   //CHECK: sub <32 x i16> %{{.*}}, %{{.*}}
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_sub_epi16(__U, __A, __B);
 }
 
 __m512i test_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mullo_epi16
+  //CHECK-LABEL: test_mm512_mullo_epi16
   //CHECK: mul <32 x i16>
   return _mm512_mullo_epi16(__A, __B);
 }
 TEST_CONSTEXPR(match_v32hi(_mm512_mullo_epi16((__m512i)(__v32hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32}, (__m512i)(__v32hi){-64, -62, +60, +58, -56, -54, +52, +50, -48, -46, +44, +42, -40, -38, +36, +34, -32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), -64, 124, 180, -232, -280, 324, 364, -400, -432, 460, 484, -504, -520, 532, 540, -544, -544, 540, 532, -520, -504, 484, 460, -432, -400, 364, 324, -280, -232, -180, -124, -64));
 
 __m512i test_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mask_mullo_epi16
+  //CHECK-LABEL: test_mm512_mask_mullo_epi16
   //CHECK: mul <32 x i16> %{{.*}}, %{{.*}}
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_mullo_epi16(__W, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_mullo_epi16
+  //CHECK-LABEL: test_mm512_maskz_mullo_epi16
   //CHECK: mul <32 x i16> %{{.*}}, %{{.*}}
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_mullo_epi16(__U, __A, __B);
 }
 
 __m512i test_mm512_mask_blend_epi8(__mmask64 __U, __m512i __A, __m512i __W) {
-  // CHECK-LABEL: @test_mm512_mask_blend_epi8
+  // CHECK-LABEL: test_mm512_mask_blend_epi8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_blend_epi8(__U,__A,__W); 
 }
 __m512i test_mm512_mask_blend_epi16(__mmask32 __U, __m512i __A, __m512i __W) {
-  // CHECK-LABEL: @test_mm512_mask_blend_epi16
+  // CHECK-LABEL: test_mm512_mask_blend_epi16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_blend_epi16(__U,__A,__W); 
 }
 __m512i test_mm512_abs_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_abs_epi8
+  // CHECK-LABEL: test_mm512_abs_epi8
   // CHECK: [[ABS:%.*]] = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %{{.*}}, i1 false)
   return _mm512_abs_epi8(__A); 
 }
 __m512i test_mm512_mask_abs_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_abs_epi8
+  // CHECK-LABEL: test_mm512_mask_abs_epi8
   // CHECK: [[ABS:%.*]] = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %{{.*}}, i1 false)
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> [[ABS]], <64 x i8> %{{.*}}
   return _mm512_mask_abs_epi8(__W,__U,__A); 
 }
 __m512i test_mm512_maskz_abs_epi8(__mmask64 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_abs_epi8
+  // CHECK-LABEL: test_mm512_maskz_abs_epi8
   // CHECK: [[ABS:%.*]] = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %{{.*}}, i1 false)
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> [[ABS]], <64 x i8> %{{.*}}
   return _mm512_maskz_abs_epi8(__U,__A); 
 }
 __m512i test_mm512_abs_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_abs_epi16
+  // CHECK-LABEL: test_mm512_abs_epi16
   // CHECK: [[ABS:%.*]] = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %{{.*}}, i1 false)
   return _mm512_abs_epi16(__A); 
 }
 __m512i test_mm512_mask_abs_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_abs_epi16
+  // CHECK-LABEL: test_mm512_mask_abs_epi16
   // CHECK: [[ABS:%.*]] = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %{{.*}}, i1 false)
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> [[ABS]], <32 x i16> %{{.*}}
   return _mm512_mask_abs_epi16(__W,__U,__A); 
 }
 __m512i test_mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_abs_epi16
+  // CHECK-LABEL: test_mm512_maskz_abs_epi16
   // CHECK: [[ABS:%.*]] = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %{{.*}}, i1 false)
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> [[ABS]], <32 x i16> %{{.*}}
   return _mm512_maskz_abs_epi16(__U,__A); 
 }
 __m512i test_mm512_packs_epi32(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_packs_epi32
+  // CHECK-LABEL: test_mm512_packs_epi32
   // CHECK: @llvm.x86.avx512.packssdw.512
   return _mm512_packs_epi32(__A,__B); 
 }
 __m512i test_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_packs_epi32
+  // CHECK-LABEL: test_mm512_maskz_packs_epi32
   // CHECK: @llvm.x86.avx512.packssdw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_packs_epi32(__M,__A,__B); 
 }
 __m512i test_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_packs_epi32
+  // CHECK-LABEL: test_mm512_mask_packs_epi32
   // CHECK: @llvm.x86.avx512.packssdw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_packs_epi32(__W,__M,__A,__B); 
 }
 __m512i test_mm512_packs_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_packs_epi16
+  // CHECK-LABEL: test_mm512_packs_epi16
   // CHECK: @llvm.x86.avx512.packsswb.512
   return _mm512_packs_epi16(__A,__B); 
 }
 __m512i test_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_packs_epi16
+  // CHECK-LABEL: test_mm512_mask_packs_epi16
   // CHECK: @llvm.x86.avx512.packsswb.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_packs_epi16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_packs_epi16
+  // CHECK-LABEL: test_mm512_maskz_packs_epi16
   // CHECK: @llvm.x86.avx512.packsswb.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_packs_epi16(__M,__A,__B); 
 }
 __m512i test_mm512_packus_epi32(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_packus_epi32
+  // CHECK-LABEL: test_mm512_packus_epi32
   // CHECK: @llvm.x86.avx512.packusdw.512
   return _mm512_packus_epi32(__A,__B); 
 }
 __m512i test_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_packus_epi32
+  // CHECK-LABEL: test_mm512_maskz_packus_epi32
   // CHECK: @llvm.x86.avx512.packusdw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_packus_epi32(__M,__A,__B); 
 }
 __m512i test_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_packus_epi32
+  // CHECK-LABEL: test_mm512_mask_packus_epi32
   // CHECK: @llvm.x86.avx512.packusdw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_packus_epi32(__W,__M,__A,__B); 
 }
 __m512i test_mm512_packus_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_packus_epi16
+  // CHECK-LABEL: test_mm512_packus_epi16
   // CHECK: @llvm.x86.avx512.packuswb.512
   return _mm512_packus_epi16(__A,__B); 
 }
 __m512i test_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_packus_epi16
+  // CHECK-LABEL: test_mm512_mask_packus_epi16
   // CHECK: @llvm.x86.avx512.packuswb.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_packus_epi16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_packus_epi16
+  // CHECK-LABEL: test_mm512_maskz_packus_epi16
   // CHECK: @llvm.x86.avx512.packuswb.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_packus_epi16(__M,__A,__B); 
 }
 __m512i test_mm512_adds_epi8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_adds_epi8
+  // CHECK-LABEL: test_mm512_adds_epi8
   // CHECK: @llvm.sadd.sat.v64i8
   return _mm512_adds_epi8(__A,__B); 
 }
-TEST_CONSTEXPR(match_v64qi(_mm512_adds_epi8((__m512i)(__v64qi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32, +33, -34, +35, -36, +37, -38, +39, -40, +41, -42, +43, -44, +45, -46, +47, +100, +50, -100, +20, +80, -50, +120, -20, -100, -50, +100, -20, -80, +50, -120, +20}, (__m512i)(__v64qi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32, +33, -34, +35, -36, +37, -38, +39, -40, +41, -42, +43, -44, +45, -46, +47, +50, +80, -50, +110, +60, -30, +20, -10, +50, +80, -50, +110, +60, -30, +20, -10}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, -56, +58, -60, +62, -64, +66, -68, +70, -72, +74, -76, +78, -80, +82, -84, +86, -88, +90, -92, +94, +127, +127, -128, +127, +127, -80, +127, -30, -50, +30, +50, +90, -20, +20, -100, +10));
+TEST_CONSTEXPR(match_v64qi(_mm512_adds_epi8((__m512i)(__v64qs){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32, +33, -34, +35, -36, +37, -38, +39, -40, +41, -42, +43, -44, +45, -46, +47, +100, +50, -100, +20, +80, -50, +120, -20, -100, -50, +100, -20, -80, +50, -120, +20}, (__m512i)(__v64qs){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32, +33, -34, +35, -36, +37, -38, +39, -40, +41, -42, +43, -44, +45, -46, +47, +50, +80, -50, +110, +60, -30, +20, -10, +50, +80, -50, +110, +60, -30, +20, -10}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, -56, +58, -60, +62, -64, +66, -68, +70, -72, +74, -76, +78, -80, +82, -84, +86, -88, +90, -92, +94, +127, +127, -128, +127, +127, -80, +127, -30, -50, +30, +50, +90, -20, +20, -100, +10));
 
 __m512i test_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_adds_epi8
+  // CHECK-LABEL: test_mm512_mask_adds_epi8
   // CHECK: @llvm.sadd.sat.v64i8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
  return _mm512_mask_adds_epi8(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_adds_epi8
+  // CHECK-LABEL: test_mm512_maskz_adds_epi8
   // CHECK: @llvm.sadd.sat.v64i8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_adds_epi8(__U,__A,__B); 
 }
 __m512i test_mm512_adds_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_adds_epi16
+  // CHECK-LABEL: test_mm512_adds_epi16
   // CHECK: @llvm.sadd.sat.v32i16
  return _mm512_adds_epi16(__A,__B); 
 }
 TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +32000, -32000, +32000, -32000}, (__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +800, -800, -800, +800}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, +32767, -32768, +31200, -31200));
 
 __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_adds_epi16
+  // CHECK-LABEL: test_mm512_mask_adds_epi16
   // CHECK: @llvm.sadd.sat.v32i16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_adds_epi16(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_adds_epi16
+  // CHECK-LABEL: test_mm512_maskz_adds_epi16
   // CHECK: @llvm.sadd.sat.v32i16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_maskz_adds_epi16(__U,__A,__B); 
 }
 __m512i test_mm512_adds_epu8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_adds_epu8
+  // CHECK-LABEL: test_mm512_adds_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.b.512
   // CHECK: call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_adds_epu8(__A,__B); 
 }
-TEST_CONSTEXPR(match_v64qu(_mm512_adds_epu8((__m512i)(__v64qu){0, 0, 0, 0, 0, 0, 0, 0, +63, +63, +63, +63, +63, +63, +63, +63, +64, +64, +64, +64, +64, +64, +64, +64, +127, +127, +127, +127, +127, +127, +127, +127, +128, +128, +128, +128, +128, +128, +128, +128, +191, +191, +191, +191, +191, +191, +191, +191, +192, +192, +192, +192, +192, +192, +192, +192, +255, +255, +255, +255, +255, +255, +255, +255}, (__m512i)(__v64qu){0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255}), 0, +63, +64, +127, +128, +191, +192, +255, +63, +126, +127, +190, +191, +254, +255, +255, +64, +127, +128, +192, +193, +255, +255, +255, +127, +190, +191, +254, +255, +255, +255, +255, +128, +191, +192, +255, +255, +255, +255, +255, +191, +254, +255, +255, +255, +255, +255, +255, +192, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255));
+TEST_CONSTEXPR(match_v64qu(_mm512_adds_epu8((__m512i)(__v64qu){0, 0, 0, 0, 0, 0, 0, 0, +63, +63, +63, +63, +63, +63, +63, +63, +64, +64, +64, +64, +64, +64, +64, +64, +127, +127, +127, +127, +127, +127, +127, +127, +128, +128, +128, +128, +128, +128, +128, +128, +191, +191, +191, +191, +191, +191, +191, +191, +192, +192, +192, +192, +192, +192, +192, +192, +255, +255, +255, +255, +255, +255, +255, +255}, (__m512i)(__v64qu){0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255}), 0, +63, +64, +127, +128, +191, +192, +255, +63, +126, +127, +190, +191, +254, +255, +255, +64, +127, +128, +191, +192, +255, +255, +255, +127, +190, +191, +254, +255, +255, +255, +255, +128, +191, +192, +255, +255, +255, +255, +255, +191, +254, +255, +255, +255, +255, +255, +255, +192, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255, +255));
 
 __m512i test_mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_adds_epu8
+  // CHECK-LABEL: test_mm512_mask_adds_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.b.512
   // CHECK: call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
@@ -1008,264 +1010,264 @@ __m512i test_mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51
 TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535));
 
 __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_adds_epu8
+  // CHECK-LABEL: test_mm512_maskz_adds_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.b.512
   // CHECK: call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_adds_epu8(__U,__A,__B); 
 }
 __m512i test_mm512_adds_epu16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_adds_epu16
+  // CHECK-LABEL: test_mm512_adds_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
   // CHECK: call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_adds_epu16(__A,__B); 
 }
 __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_adds_epu16
+  // CHECK-LABEL: test_mm512_mask_adds_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
   // CHECK: call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_adds_epu16(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_adds_epu16
+  // CHECK-LABEL: test_mm512_maskz_adds_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
   // CHECK: call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_adds_epu16(__U,__A,__B); 
 }
 __m512i test_mm512_avg_epu8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_avg_epu8
+  // CHECK-LABEL: test_mm512_avg_epu8
   // CHECK: @llvm.x86.avx512.pavg.b.512
   return _mm512_avg_epu8(__A,__B); 
 }
 __m512i test_mm512_mask_avg_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_avg_epu8
+  // CHECK-LABEL: test_mm512_mask_avg_epu8
   // CHECK: @llvm.x86.avx512.pavg.b.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_avg_epu8(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_avg_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_avg_epu8
+  // CHECK-LABEL: test_mm512_maskz_avg_epu8
   // CHECK: @llvm.x86.avx512.pavg.b.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_avg_epu8(__U,__A,__B); 
 }
 __m512i test_mm512_avg_epu16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_avg_epu16
+  // CHECK-LABEL: test_mm512_avg_epu16
   // CHECK: @llvm.x86.avx512.pavg.w.512
   return _mm512_avg_epu16(__A,__B); 
 }
 __m512i test_mm512_mask_avg_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_avg_epu16
+  // CHECK-LABEL: test_mm512_mask_avg_epu16
   // CHECK: @llvm.x86.avx512.pavg.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_avg_epu16(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_avg_epu16
+  // CHECK-LABEL: test_mm512_maskz_avg_epu16
   // CHECK: @llvm.x86.avx512.pavg.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_avg_epu16(__U,__A,__B); 
 }
 __m512i test_mm512_max_epi8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_max_epi8
+  // CHECK-LABEL: test_mm512_max_epi8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_max_epi8(__A,__B); 
 }
 __m512i test_mm512_maskz_max_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_max_epi8
+  // CHECK-LABEL: test_mm512_maskz_max_epi8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_maskz_max_epi8(__M,__A,__B); 
 }
 __m512i test_mm512_mask_max_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_max_epi8
+  // CHECK-LABEL: test_mm512_mask_max_epi8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_mask_max_epi8(__W,__M,__A,__B); 
 }
 __m512i test_mm512_max_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_max_epi16
+  // CHECK-LABEL: test_mm512_max_epi16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_max_epi16(__A,__B); 
 }
 __m512i test_mm512_maskz_max_epi16(__mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_max_epi16
+  // CHECK-LABEL: test_mm512_maskz_max_epi16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_maskz_max_epi16(__M,__A,__B); 
 }
 __m512i test_mm512_mask_max_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_max_epi16
+  // CHECK-LABEL: test_mm512_mask_max_epi16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_mask_max_epi16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_max_epu8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_max_epu8
+  // CHECK-LABEL: test_mm512_max_epu8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_max_epu8(__A,__B); 
 }
 __m512i test_mm512_maskz_max_epu8(__mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_max_epu8
+  // CHECK-LABEL: test_mm512_maskz_max_epu8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_maskz_max_epu8(__M,__A,__B); 
 }
 __m512i test_mm512_mask_max_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_max_epu8
+  // CHECK-LABEL: test_mm512_mask_max_epu8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_mask_max_epu8(__W,__M,__A,__B); 
 }
 __m512i test_mm512_max_epu16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_max_epu16
+  // CHECK-LABEL: test_mm512_max_epu16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_max_epu16(__A,__B); 
 }
 __m512i test_mm512_maskz_max_epu16(__mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_max_epu16
+  // CHECK-LABEL: test_mm512_maskz_max_epu16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_maskz_max_epu16(__M,__A,__B); 
 }
 __m512i test_mm512_mask_max_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_max_epu16
+  // CHECK-LABEL: test_mm512_mask_max_epu16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_mask_max_epu16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_min_epi8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_min_epi8
+  // CHECK-LABEL: test_mm512_min_epi8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_min_epi8(__A,__B); 
 }
 __m512i test_mm512_maskz_min_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_min_epi8
+  // CHECK-LABEL: test_mm512_maskz_min_epi8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_maskz_min_epi8(__M,__A,__B); 
 }
 __m512i test_mm512_mask_min_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_min_epi8
+  // CHECK-LABEL: test_mm512_mask_min_epi8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_mask_min_epi8(__W,__M,__A,__B); 
 }
 __m512i test_mm512_min_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_min_epi16
+  // CHECK-LABEL: test_mm512_min_epi16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_min_epi16(__A,__B); 
 }
 __m512i test_mm512_maskz_min_epi16(__mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_min_epi16
+  // CHECK-LABEL: test_mm512_maskz_min_epi16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_maskz_min_epi16(__M,__A,__B); 
 }
 __m512i test_mm512_mask_min_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_min_epi16
+  // CHECK-LABEL: test_mm512_mask_min_epi16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_mask_min_epi16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_min_epu8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_min_epu8
+  // CHECK-LABEL: test_mm512_min_epu8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_min_epu8(__A,__B); 
 }
 __m512i test_mm512_maskz_min_epu8(__mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_min_epu8
+  // CHECK-LABEL: test_mm512_maskz_min_epu8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_maskz_min_epu8(__M,__A,__B); 
 }
 __m512i test_mm512_mask_min_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_min_epu8
+  // CHECK-LABEL: test_mm512_mask_min_epu8
   // CHECK:       [[RES:%.*]] = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK:       select <64 x i1> {{.*}}, <64 x i8> [[RES]], <64 x i8> {{.*}}
   return _mm512_mask_min_epu8(__W,__M,__A,__B); 
 }
 __m512i test_mm512_min_epu16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_min_epu16
+  // CHECK-LABEL: test_mm512_min_epu16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_min_epu16(__A,__B); 
 }
 __m512i test_mm512_maskz_min_epu16(__mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_min_epu16
+  // CHECK-LABEL: test_mm512_maskz_min_epu16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_maskz_min_epu16(__M,__A,__B); 
 }
 __m512i test_mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_min_epu16
+  // CHECK-LABEL: test_mm512_mask_min_epu16
   // CHECK:       [[RES:%.*]] = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK:       select <32 x i1> {{.*}}, <32 x i16> [[RES]], <32 x i16> {{.*}}
   return _mm512_mask_min_epu16(__W,__M,__A,__B); 
 }
 __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_shuffle_epi8
+  // CHECK-LABEL: test_mm512_shuffle_epi8
   // CHECK: @llvm.x86.avx512.pshuf.b.512
   return _mm512_shuffle_epi8(__A,__B); 
 }
 __m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_shuffle_epi8
+  // CHECK-LABEL: test_mm512_mask_shuffle_epi8
   // CHECK: @llvm.x86.avx512.pshuf.b.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_shuffle_epi8(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_shuffle_epi8
+  // CHECK-LABEL: test_mm512_maskz_shuffle_epi8
   // CHECK: @llvm.x86.avx512.pshuf.b.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_shuffle_epi8(__U,__A,__B); 
 }
 __m512i test_mm512_subs_epi8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_subs_epi8
+  // CHECK-LABEL: test_mm512_subs_epi8
   // CHECK: @llvm.ssub.sat.v64i8
 return _mm512_subs_epi8(__A,__B); 
 }
-TEST_CONSTEXPR(match_v64qi(_mm512_subs_epi8((__m512i)(__v64qi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32, +33, -34, +35, -36, +37, -38, +39, -40, +41, -42, +43, -44, +45, -46, +47, +100, +50, -100, +20, +80, -50, +120, -20, -100, -50, +100, -20, -80, +50, -120, +20}, (__m512i)(__v64qi){0, -1, +2, -3, +4, -5, +6, -7, +8, -9, +10, -11, +12, -13, +14, -15, +16, -17, +18, -19, +20, -21, +22, -23, +24, -25, +26, -27, +28, -29, +30, -31, +32, -33, +34, -35, +36, -37, +38, -39, +40, -41, +42, -43, +44, -45, +46, -47, -50, -80, +50, -110, -60, +30, -20, +10, -50, -80, +50, -110, -60, +30, -20, +10}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, -56, +58, -60, +62, -64, +66, -68, +70, -72, +74, -76, +78, -80, +82, -84, +86, -88, +90, -92, +94, +127, +127, -128, +127, +127, -80, +127, -30, -50, +30, +50, +90, -20, +20, -100, +10));
+TEST_CONSTEXPR(match_v64qi(_mm512_subs_epi8((__m512i)(__v64qs){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32, +33, -34, +35, -36, +37, -38, +39, -40, +41, -42, +43, -44, +45, -46, +47, +100, +50, -100, +20, +80, -50, +120, -20, -100, -50, +100, -20, -80, +50, -120, +20}, (__m512i)(__v64qs){0, -1, +2, -3, +4, -5, +6, -7, +8, -9, +10, -11, +12, -13, +14, -15, +16, -17, +18, -19, +20, -21, +22, -23, +24, -25, +26, -27, +28, -29, +30, -31, +32, -33, +34, -35, +36, -37, +38, -39, +40, -41, +42, -43, +44, -45, +46, -47, -50, -80, +50, -110, -60, +30, -20, +10, -50, -80, +50, -110, -60, +30, -20, +10}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, -56, +58, -60, +62, -64, +66, -68, +70, -72, +74, -76, +78, -80, +82, -84, +86, -88, +90, -92, +94, +127, +127, -128, +127, +127, -80, +127, -30, -50, +30, +50, +90, -20, +20, -100, +10));
 
 __m512i test_mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_subs_epi8
+  // CHECK-LABEL: test_mm512_mask_subs_epi8
   // CHECK: @llvm.ssub.sat.v64i8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
 return _mm512_mask_subs_epi8(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_subs_epi8
+  // CHECK-LABEL: test_mm512_maskz_subs_epi8
   // CHECK: @llvm.ssub.sat.v64i8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
 return _mm512_maskz_subs_epi8(__U,__A,__B); 
 }
 __m512i test_mm512_subs_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_subs_epi16
+  // CHECK-LABEL: test_mm512_subs_epi16
   // CHECK: @llvm.ssub.sat.v32i16
 return _mm512_subs_epi16(__A,__B); 
 }
 TEST_CONSTEXPR(match_v32hi(_mm512_subs_epi16((__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +32000, -32000, +32000, -32000}, (__m512i)(__v32hi){0, -1, +2, -3, +4, -5, +6, -7, +8, -9, +10, -11, +12, -13, +14, -15, +16, -17, +18, -19, +20, -21, +22, -23, +24, -25, +26, -27, -800, +800, +800, -800}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, +32767, -32768, +31200, -31200));
 __m512i test_mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_subs_epi16
+  // CHECK-LABEL: test_mm512_mask_subs_epi16
   // CHECK: @llvm.ssub.sat.v32i16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_mask_subs_epi16(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_subs_epi16
+  // CHECK-LABEL: test_mm512_maskz_subs_epi16
   // CHECK: @llvm.ssub.sat.v32i16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_maskz_subs_epi16(__U,__A,__B); 
 }
 __m512i test_mm512_subs_epu8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_subs_epu8
+  // CHECK-LABEL: test_mm512_subs_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
   // CHECK: call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
 return _mm512_subs_epu8(__A,__B); 
 }
 __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_subs_epu8
+  // CHECK-LABEL: test_mm512_mask_subs_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
   // CHECK: call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
@@ -1274,20 +1276,20 @@ return _mm512_mask_subs_epu8(__W,__U,__A,__B);
 TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0, 0, 0, 0, 0, 0, 0, 0, +63, +63, +63, +63, +63, +63, +63, +63, +64, +64, +64, +64, +64, +64, +64, +64, +127, +127, +127, +127, +127, +127, +127, +127, +128, +128, +128, +128, +128, +128, +128, +128, +191, +191, +191, +191, +191, +191, +191, +191, +192, +192, +192, +192, +192, +192, +192, +192, +255, +255, +255, +255, +255, +255, +255, +255}, (__m512i)(__v64qu){0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255}), 0, 0, 0, 0, 0, 0, 0, 0, +63, 0, 0, 0, 0, 0, 0, 0, +64, +1, 0, 0, 0, 0, 0, 0, +127, +64, +63, 0, 0, 0, 0, 0, +128, +65, +64, +1, 0, 0, 0, 0, +191, +128, +127, +64, +63, 0, 0, 0, +192, +129, +128, +65, +64, +1, 0, 0, +255, +192, +191, +128, +127, +64, +63, +0));
 
 __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_subs_epu8
+  // CHECK-LABEL: test_mm512_maskz_subs_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
   // CHECK: call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
 return _mm512_maskz_subs_epu8(__U,__A,__B); 
 }
 __m512i test_mm512_subs_epu16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_subs_epu16
+  // CHECK-LABEL: test_mm512_subs_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
   // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 return _mm512_subs_epu16(__A,__B); 
 }
 __m512i test_mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_subs_epu16
+  // CHECK-LABEL: test_mm512_mask_subs_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
   // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
@@ -1295,627 +1297,630 @@ return _mm512_mask_subs_epu16(__W,__U,__A,__B);
 TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0));
 }
 __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_subs_epu16
+  // CHECK-LABEL: test_mm512_maskz_subs_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
   // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_maskz_subs_epu16(__U,__A,__B); 
 }
 __m512i test_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi16
+  // CHECK-LABEL: test_mm512_mask2_permutex2var_epi16
   // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask2_permutex2var_epi16(__A,__I,__U,__B); 
 }
 __m512i test_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_permutex2var_epi16
+  // CHECK-LABEL: test_mm512_permutex2var_epi16
   // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
   return _mm512_permutex2var_epi16(__A,__I,__B); 
 }
 __m512i test_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi16
+  // CHECK-LABEL: test_mm512_mask_permutex2var_epi16
   // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_permutex2var_epi16(__A,__U,__I,__B); 
 }
 __m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi16
+  // CHECK-LABEL: test_mm512_maskz_permutex2var_epi16
   // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B); 
 }
 
 __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mulhrs_epi16
+  // CHECK-LABEL: test_mm512_mulhrs_epi16
   // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
   return _mm512_mulhrs_epi16(__A,__B); 
 }
 __m512i test_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A,        __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_mulhrs_epi16
+  // CHECK-LABEL: test_mm512_mask_mulhrs_epi16
   // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_mulhrs_epi16(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mulhrs_epi16
+  // CHECK-LABEL: test_mm512_maskz_mulhrs_epi16
   // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_mulhrs_epi16(__U,__A,__B); 
 }
 __m512i test_mm512_mulhi_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mulhi_epi16
+  // CHECK-LABEL: test_mm512_mulhi_epi16
   // CHECK: @llvm.x86.avx512.pmulh.w.512
   return _mm512_mulhi_epi16(__A,__B); 
 }
 TEST_CONSTEXPR(match_v32hi(_mm512_mulhi_epi16((__m512i)(__v32hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32}, (__m512i)(__v32hi){-64, -62, +60, +58, -56, -54, +52, +50, -48, -46, +44, +42, -40, -38, +36, +34, -32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, -1, -1, -1));
 
 __m512i test_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A,       __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_mulhi_epi16
+  // CHECK-LABEL: test_mm512_mask_mulhi_epi16
   // CHECK: @llvm.x86.avx512.pmulh.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_mulhi_epi16(__W,__U,__A,__B); 
 }
 
 __m512i test_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mulhi_epi16
+  // CHECK-LABEL: test_mm512_maskz_mulhi_epi16
   // CHECK: @llvm.x86.avx512.pmulh.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_mulhi_epi16(__U,__A,__B); 
 }
 
 __m512i test_mm512_mulhi_epu16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mulhi_epu16
+  // CHECK-LABEL: test_mm512_mulhi_epu16
   // CHECK: @llvm.x86.avx512.pmulhu.w.512
   return _mm512_mulhi_epu16(__A,__B); 
 }
 TEST_CONSTEXPR(match_v32hi(_mm512_mulhi_epu16((__m512i)(__v32hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32}, (__m512i)(__v32hi){-64, -62, +60, +58, -56, -54, +52, +50, -48, -46, +44, +42, -40, -38, +36, +34, -32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), 0, -64, 0, 57, 4, -60, 0, 49, 8, -56, 0, 41, 12, -52, 0, 33, 16, -48, 0, 25, 20, -44, 0, 17, 24, -40, 0, 9, 28, 5, 30, 1));
 
 __m512i test_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A,       __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_mulhi_epu16
+  // CHECK-LABEL: test_mm512_mask_mulhi_epu16
   // CHECK: @llvm.x86.avx512.pmulhu.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_mulhi_epu16(__W,__U,__A,__B); 
 }
 
 __m512i test_mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mulhi_epu16
+  // CHECK-LABEL: test_mm512_maskz_mulhi_epu16
   // CHECK: @llvm.x86.avx512.pmulhu.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_mulhi_epu16(__U,__A,__B); 
 }
 
 __m512i test_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maddubs_epi16
+  // CHECK-LABEL: test_mm512_maddubs_epi16
   // CHECK: @llvm.x86.avx512.pmaddubs.w.512
   return _mm512_maddubs_epi16(__X,__Y); 
 }
 __m512i test_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,         __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_maddubs_epi16
+  // CHECK-LABEL: test_mm512_mask_maddubs_epi16
   // CHECK: @llvm.x86.avx512.pmaddubs.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_maddubs_epi16(__W,__U,__X,__Y); 
 }
 __m512i test_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_maddubs_epi16
+  // CHECK-LABEL: test_mm512_maskz_maddubs_epi16
   // CHECK: @llvm.x86.avx512.pmaddubs.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_maddubs_epi16(__U,__X,__Y); 
 }
 __m512i test_mm512_madd_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_madd_epi16
+  // CHECK-LABEL: test_mm512_madd_epi16
   // CHECK: @llvm.x86.avx512.pmaddw.d.512
   return _mm512_madd_epi16(__A,__B); 
 }
 __m512i test_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A,      __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_madd_epi16
+  // CHECK-LABEL: test_mm512_mask_madd_epi16
   // CHECK: @llvm.x86.avx512.pmaddw.d.512
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_madd_epi16(__W,__U,__A,__B); 
 }
 __m512i test_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_madd_epi16
+  // CHECK-LABEL: test_mm512_maskz_madd_epi16
   // CHECK: @llvm.x86.avx512.pmaddw.d.512
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_madd_epi16(__U,__A,__B); 
 }
 
 __m256i test_mm512_cvtsepi16_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtsepi16_epi8
+  // CHECK-LABEL: test_mm512_cvtsepi16_epi8
   // CHECK: @llvm.x86.avx512.mask.pmovs.wb.512
   return _mm512_cvtsepi16_epi8(__A); 
 }
 
 __m256i test_mm512_mask_cvtsepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi16_epi8
+  // CHECK-LABEL: test_mm512_mask_cvtsepi16_epi8
   // CHECK: @llvm.x86.avx512.mask.pmovs.wb.512
   return _mm512_mask_cvtsepi16_epi8(__O, __M, __A); 
 }
 
 __m256i test_mm512_maskz_cvtsepi16_epi8(__mmask32 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtsepi16_epi8
+  // CHECK-LABEL: test_mm512_maskz_cvtsepi16_epi8
   // CHECK: @llvm.x86.avx512.mask.pmovs.wb.512
   return _mm512_maskz_cvtsepi16_epi8(__M, __A); 
 }
 
 __m256i test_mm512_cvtusepi16_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtusepi16_epi8
+  // CHECK-LABEL: test_mm512_cvtusepi16_epi8
   // CHECK: @llvm.x86.avx512.mask.pmovus.wb.512
   return _mm512_cvtusepi16_epi8(__A); 
 }
 
 __m256i test_mm512_mask_cvtusepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi16_epi8
+  // CHECK-LABEL: test_mm512_mask_cvtusepi16_epi8
   // CHECK: @llvm.x86.avx512.mask.pmovus.wb.512
   return _mm512_mask_cvtusepi16_epi8(__O, __M, __A); 
 }
 
 __m256i test_mm512_maskz_cvtusepi16_epi8(__mmask32 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtusepi16_epi8
+  // CHECK-LABEL: test_mm512_maskz_cvtusepi16_epi8
   // CHECK: @llvm.x86.avx512.mask.pmovus.wb.512
   return _mm512_maskz_cvtusepi16_epi8(__M, __A); 
 }
 
 __m256i test_mm512_cvtepi16_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi16_epi8
+  // CHECK-LABEL: test_mm512_cvtepi16_epi8
   // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
   return _mm512_cvtepi16_epi8(__A); 
 }
 
 __m256i test_mm512_mask_cvtepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi8
+  // CHECK-LABEL: test_mm512_mask_cvtepi16_epi8
   // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm512_mask_cvtepi16_epi8(__O, __M, __A); 
 }
 
 __m256i test_mm512_maskz_cvtepi16_epi8(__mmask32 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi8
+  // CHECK-LABEL: test_mm512_maskz_cvtepi16_epi8
   // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm512_maskz_cvtepi16_epi8(__M, __A); 
 }
 
 __m512i test_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_unpackhi_epi8
+  // CHECK-LABEL: test_mm512_unpackhi_epi8
   // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
   return _mm512_unpackhi_epi8(__A, __B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_unpackhi_epi8((__m512i)(__v64qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}, (__m512i)(__v64qi){64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127}), 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95, 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127));
 
 __m512i test_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpackhi_epi8
+  // CHECK-LABEL: test_mm512_mask_unpackhi_epi8
   // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi8
+  // CHECK-LABEL: test_mm512_maskz_unpackhi_epi8
   // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_unpackhi_epi8(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_unpackhi_epi16
+  // CHECK-LABEL: test_mm512_unpackhi_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   return _mm512_unpackhi_epi16(__A, __B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_unpackhi_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m512i)(__v32hi){32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), 4, 36, 5, 37, 6, 38, 7, 39, 12, 44, 13, 45, 14, 46, 15, 47, 20, 52, 21, 53, 22, 54, 23, 55, 28, 60, 29, 61, 30, 62, 31, 63));
+
 
 __m512i test_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpackhi_epi16
+  // CHECK-LABEL: test_mm512_mask_unpackhi_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi16
+  // CHECK-LABEL: test_mm512_maskz_unpackhi_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_unpackhi_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_unpacklo_epi8
+  // CHECK-LABEL: test_mm512_unpacklo_epi8
   // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
   return _mm512_unpacklo_epi8(__A, __B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_unpacklo_epi8((__m512i)(__v64qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}, (__m512i)(__v64qi){64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127}), 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103, 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119));
 
 __m512i test_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpacklo_epi8
+  // CHECK-LABEL: test_mm512_mask_unpacklo_epi8
   // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi8
+  // CHECK-LABEL: test_mm512_maskz_unpacklo_epi8
   // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_unpacklo_epi8(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_unpacklo_epi16
+  // CHECK-LABEL: test_mm512_unpacklo_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
   return _mm512_unpacklo_epi16(__A, __B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_unpacklo_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m512i)(__v32hi){32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), 0, 32, 1, 33, 2, 34, 3, 35, 8, 40, 9, 41, 10, 42, 11, 43, 16, 48, 17, 49, 18, 50, 19, 51, 24, 56, 25, 57, 26, 58, 27, 59));
 
 __m512i test_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpacklo_epi16
+  // CHECK-LABEL: test_mm512_mask_unpacklo_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi16
+  // CHECK-LABEL: test_mm512_maskz_unpacklo_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_unpacklo_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_cvtepi8_epi16(__m256i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi8_epi16
+  // CHECK-LABEL: test_mm512_cvtepi8_epi16
   // CHECK: sext <32 x i8> %{{.*}} to <32 x i16>
   return _mm512_cvtepi8_epi16(__A); 
 }
-
-TEST_CONSTEXPR(match_v32hi(_mm512_cvtepi8_epi16(_mm256_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16, 17, -18, 19, -20, 21, -22, 23, -24, 25, -26, 27, -28)), -3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16, 17, -18, 19, -20, 21, -22, 23, -24, 25, -26, 27, -28));
+TEST_CONSTEXPR(match_v32hi(_mm512_cvtepi8_epi16((__m256i)(__v32qs){-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16, 17, -18, 19, -20, 21, -22, 23, -24, 25, -26, 27, -28}), -3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16, 17, -18, 19, -20, 21, -22, 23, -24, 25, -26, 27, -28));
 
 __m512i test_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi16
+  // CHECK-LABEL: test_mm512_mask_cvtepi8_epi16
   // CHECK: sext <32 x i8> %{{.*}} to <32 x i16>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_cvtepi8_epi16(__W, __U, __A); 
 }
 
 __m512i test_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi16
+  // CHECK-LABEL: test_mm512_maskz_cvtepi8_epi16
   // CHECK: sext <32 x i8> %{{.*}} to <32 x i16>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_cvtepi8_epi16(__U, __A); 
 }
 
 __m512i test_mm512_cvtepu8_epi16(__m256i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepu8_epi16
+  // CHECK-LABEL: test_mm512_cvtepu8_epi16
   // CHECK: zext <32 x i8> %{{.*}} to <32 x i16>
   return _mm512_cvtepu8_epi16(__A); 
 }
-
-TEST_CONSTEXPR(match_v32hi(_mm512_cvtepu8_epi16(_mm256_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16, 17, -18, 19, -20, 21, -22, 23, -24, 25, -26, 27, -28)), 253, 2, 255, 0, 1, 254, 3, 252, 5, 250, 7, 248, 9, 246, 11, 244, 13, 242, 15, 240, 17, 238, 19, 236, 21, 234, 23, 232, 25, 230, 27, 228));
+TEST_CONSTEXPR(match_v32hi(_mm512_cvtepu8_epi16((__m256i)(__v32qs){-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16, 17, -18, 19, -20, 21, -22, 23, -24, 25, -26, 27, -28}), 253, 2, 255, 0, 1, 254, 3, 252, 5, 250, 7, 248, 9, 246, 11, 244, 13, 242, 15, 240, 17, 238, 19, 236, 21, 234, 23, 232, 25, 230, 27, 228));
 
 __m512i test_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi16
+  // CHECK-LABEL: test_mm512_mask_cvtepu8_epi16
   // CHECK: zext <32 x i8> %{{.*}} to <32 x i16>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_cvtepu8_epi16(__W, __U, __A); 
 }
 
 __m512i test_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi16
+  // CHECK-LABEL: test_mm512_maskz_cvtepu8_epi16
   // CHECK: zext <32 x i8> %{{.*}} to <32 x i16>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_cvtepu8_epi16(__U, __A); 
 }
 
 __m512i test_mm512_shufflehi_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_shufflehi_epi16
+  // CHECK-LABEL: test_mm512_shufflehi_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
   return _mm512_shufflehi_epi16(__A, 5); 
 }
 
 __m512i test_mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_shufflehi_epi16
+  // CHECK-LABEL: test_mm512_mask_shufflehi_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_shufflehi_epi16(__W, __U, __A, 5); 
 }
 
 __m512i test_mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_shufflehi_epi16
+  // CHECK-LABEL: test_mm512_maskz_shufflehi_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_shufflehi_epi16(__U, __A, 5); 
 }
 
 __m512i test_mm512_shufflelo_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_shufflelo_epi16
+  // CHECK-LABEL: test_mm512_shufflelo_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   return _mm512_shufflelo_epi16(__A, 5); 
 }
 
 __m512i test_mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_shufflelo_epi16
+  // CHECK-LABEL: test_mm512_mask_shufflelo_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_shufflelo_epi16(__W, __U, __A, 5); 
 }
 
 __m512i test_mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_shufflelo_epi16
+  // CHECK-LABEL: test_mm512_maskz_shufflelo_epi16
   // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_shufflelo_epi16(__U, __A, 5); 
 }
 
 __m512i test_mm512_sllv_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_sllv_epi16
+  // CHECK-LABEL: test_mm512_sllv_epi16
   // CHECK: @llvm.x86.avx512.psllv.w.512(
   return _mm512_sllv_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_sllv_epi16
+  // CHECK-LABEL: test_mm512_mask_sllv_epi16
   // CHECK: @llvm.x86.avx512.psllv.w.512(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_sllv_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sllv_epi16
+  // CHECK-LABEL: test_mm512_maskz_sllv_epi16
   // CHECK: @llvm.x86.avx512.psllv.w.512(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_sllv_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_sll_epi16(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_sll_epi16
+  // CHECK-LABEL: test_mm512_sll_epi16
   // CHECK: @llvm.x86.avx512.psll.w.512
   return _mm512_sll_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_sll_epi16
+  // CHECK-LABEL: test_mm512_mask_sll_epi16
   // CHECK: @llvm.x86.avx512.psll.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_sll_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sll_epi16
+  // CHECK-LABEL: test_mm512_maskz_sll_epi16
   // CHECK: @llvm.x86.avx512.psll.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_sll_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_slli_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_slli_epi16
+  // CHECK-LABEL: test_mm512_slli_epi16
   // CHECK: @llvm.x86.avx512.pslli.w.512
   return _mm512_slli_epi16(__A, 5); 
 }
 
 __m512i test_mm512_slli_epi16_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_slli_epi16_2
+  // CHECK-LABEL: test_mm512_slli_epi16_2
   // CHECK: @llvm.x86.avx512.pslli.w.512
   return _mm512_slli_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_slli_epi16
+  // CHECK-LABEL: test_mm512_mask_slli_epi16
   // CHECK: @llvm.x86.avx512.pslli.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_slli_epi16(__W, __U, __A, 5); 
 }
 
 __m512i test_mm512_mask_slli_epi16_2(__m512i __W, __mmask32 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_slli_epi16_2
+  // CHECK-LABEL: test_mm512_mask_slli_epi16_2
   // CHECK: @llvm.x86.avx512.pslli.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_slli_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_slli_epi16
+  // CHECK-LABEL: test_mm512_maskz_slli_epi16
   // CHECK: @llvm.x86.avx512.pslli.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_slli_epi16(__U, __A, 5); 
 }
 
 __m512i test_mm512_maskz_slli_epi16_2(__mmask32 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_slli_epi16_2
+  // CHECK-LABEL: test_mm512_maskz_slli_epi16_2
   // CHECK: @llvm.x86.avx512.pslli.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_slli_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_bslli_epi128(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_bslli_epi128
+  // CHECK-LABEL: test_mm512_bslli_epi128
   // CHECK: shufflevector <64 x i8> zeroinitializer, <64 x i8> %{{.*}}, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
   return _mm512_bslli_epi128(__A, 5);
 }
 
 __m512i test_mm512_srlv_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_srlv_epi16
+  // CHECK-LABEL: test_mm512_srlv_epi16
   // CHECK: @llvm.x86.avx512.psrlv.w.512(
   return _mm512_srlv_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_srlv_epi16
+  // CHECK-LABEL: test_mm512_mask_srlv_epi16
   // CHECK: @llvm.x86.avx512.psrlv.w.512(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_srlv_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srlv_epi16
+  // CHECK-LABEL: test_mm512_maskz_srlv_epi16
   // CHECK: @llvm.x86.avx512.psrlv.w.512(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_srlv_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_srav_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_srav_epi16
+  // CHECK-LABEL: test_mm512_srav_epi16
   // CHECK: @llvm.x86.avx512.psrav.w.512(
   return _mm512_srav_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_srav_epi16
+  // CHECK-LABEL: test_mm512_mask_srav_epi16
   // CHECK: @llvm.x86.avx512.psrav.w.512(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_srav_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srav_epi16
+  // CHECK-LABEL: test_mm512_maskz_srav_epi16
   // CHECK: @llvm.x86.avx512.psrav.w.512(
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_srav_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_sra_epi16(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_sra_epi16
+  // CHECK-LABEL: test_mm512_sra_epi16
   // CHECK: @llvm.x86.avx512.psra.w.512
   return _mm512_sra_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_sra_epi16
+  // CHECK-LABEL: test_mm512_mask_sra_epi16
   // CHECK: @llvm.x86.avx512.psra.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_sra_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sra_epi16
+  // CHECK-LABEL: test_mm512_maskz_sra_epi16
   // CHECK: @llvm.x86.avx512.psra.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_sra_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_srai_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_srai_epi16
+  // CHECK-LABEL: test_mm512_srai_epi16
   // CHECK: @llvm.x86.avx512.psrai.w.512
   return _mm512_srai_epi16(__A, 5); 
 }
 
 __m512i test_mm512_srai_epi16_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_srai_epi16_2
+  // CHECK-LABEL: test_mm512_srai_epi16_2
   // CHECK: @llvm.x86.avx512.psrai.w.512
   return _mm512_srai_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_srai_epi16
+  // CHECK-LABEL: test_mm512_mask_srai_epi16
   // CHECK: @llvm.x86.avx512.psrai.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_srai_epi16(__W, __U, __A, 5); 
 }
 
 __m512i test_mm512_mask_srai_epi16_2(__m512i __W, __mmask32 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_srai_epi16_2
+  // CHECK-LABEL: test_mm512_mask_srai_epi16_2
   // CHECK: @llvm.x86.avx512.psrai.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_srai_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_srai_epi16
+  // CHECK-LABEL: test_mm512_maskz_srai_epi16
   // CHECK: @llvm.x86.avx512.psrai.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_srai_epi16(__U, __A, 5); 
 }
 
 __m512i test_mm512_maskz_srai_epi16_2(__mmask32 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srai_epi16_2
+  // CHECK-LABEL: test_mm512_maskz_srai_epi16_2
   // CHECK: @llvm.x86.avx512.psrai.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_srai_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_srl_epi16(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_srl_epi16
+  // CHECK-LABEL: test_mm512_srl_epi16
   // CHECK: @llvm.x86.avx512.psrl.w.512
   return _mm512_srl_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_srl_epi16
+  // CHECK-LABEL: test_mm512_mask_srl_epi16
   // CHECK: @llvm.x86.avx512.psrl.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_srl_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srl_epi16
+  // CHECK-LABEL: test_mm512_maskz_srl_epi16
   // CHECK: @llvm.x86.avx512.psrl.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_srl_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_srli_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_srli_epi16
+  // CHECK-LABEL: test_mm512_srli_epi16
   // CHECK: @llvm.x86.avx512.psrli.w.512
   return _mm512_srli_epi16(__A, 5); 
 }
 
 __m512i test_mm512_srli_epi16_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_srli_epi16_2
+  // CHECK-LABEL: test_mm512_srli_epi16_2
   // CHECK: @llvm.x86.avx512.psrli.w.512
   return _mm512_srli_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_srli_epi16
+  // CHECK-LABEL: test_mm512_mask_srli_epi16
   // CHECK: @llvm.x86.avx512.psrli.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_srli_epi16(__W, __U, __A, 5); 
 }
 
 __m512i test_mm512_mask_srli_epi16_2(__m512i __W, __mmask32 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_srli_epi16_2
+  // CHECK-LABEL: test_mm512_mask_srli_epi16_2
   // CHECK: @llvm.x86.avx512.psrli.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_srli_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_srli_epi16
+  // CHECK-LABEL: test_mm512_maskz_srli_epi16
   // CHECK: @llvm.x86.avx512.psrli.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_srli_epi16(__U, __A, 5); 
 }
 
 __m512i test_mm512_maskz_srli_epi16_2(__mmask32 __U, __m512i __A, int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srli_epi16_2
+  // CHECK-LABEL: test_mm512_maskz_srli_epi16_2
   // CHECK: @llvm.x86.avx512.psrli.w.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_srli_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_bsrli_epi128(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_bsrli_epi128
+  // CHECK-LABEL: test_mm512_bsrli_epi128
   // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
   return _mm512_bsrli_epi128(__A, 5);
 }
 __m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_mov_epi16
+  // CHECK-LABEL: test_mm512_mask_mov_epi16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_mov_epi16(__W, __U, __A); 
 }
 
 __m512i test_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_mov_epi16
+  // CHECK-LABEL: test_mm512_maskz_mov_epi16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_mov_epi16(__U, __A); 
 }
 
 __m512i test_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_mov_epi8
+  // CHECK-LABEL: test_mm512_mask_mov_epi8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_mov_epi8(__W, __U, __A); 
 }
 
 __m512i test_mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_mov_epi8
+  // CHECK-LABEL: test_mm512_maskz_mov_epi8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_mov_epi8(__U, __A); 
 }
 
 __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
-  // CHECK-LABEL: @test_mm512_mask_set1_epi8
+  // CHECK-LABEL: test_mm512_mask_set1_epi8
   // CHECK: insertelement <64 x i8> poison, i8 %{{.*}}, i32 0
   // CHECK: insertelement <64 x i8> %{{.*}}, i8 %{{.*}}, i32 1
   // CHECK: insertelement <64 x i8> %{{.*}}, i8 %{{.*}}, i32 2
@@ -1983,7 +1988,7 @@ __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
 }
 
 __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
-  // CHECK-LABEL: @test_mm512_maskz_set1_epi8
+  // CHECK-LABEL: test_mm512_maskz_set1_epi8
   // CHECK: insertelement <64 x i8> poison, i8 %{{.*}}, i32 0
   // CHECK: insertelement <64 x i8> %{{.*}}, i8 %{{.*}}, i32 1
   // CHECK: insertelement <64 x i8> %{{.*}}, i8 %{{.*}}, i32 2
@@ -2053,7 +2058,7 @@ __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
 }
 
 __mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_mm512_kunpackd
+  // CHECK-LABEL: test_mm512_kunpackd
   // CHECK: [[LHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK: [[LHS2:%.*]] = shufflevector <64 x i1> [[LHS]], <64 x i1> [[LHS]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -2063,7 +2068,7 @@ __mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D
 }
 
 __mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_mm512_kunpackw
+  // CHECK-LABEL: test_mm512_kunpackw
   // CHECK: [[LHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[RHS:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: [[LHS2:%.*]] = shufflevector <32 x i1> [[LHS]], <32 x i1> [[LHS]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -2074,74 +2079,74 @@ __mmask32 test_mm512_kunpackw(__m512i __A, __m512i __B, __m512i __C, __m512i __D
 
 __m512i test_mm512_loadu_epi16 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_loadu_epi16
+  // CHECK-LABEL: test_mm512_loadu_epi16
   // CHECK: load <8 x i64>, ptr %{{.*}}, align 1{{$}}
   return _mm512_loadu_epi16 (__P);
 }
 
 __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_mask_loadu_epi16
+  // CHECK-LABEL: test_mm512_mask_loadu_epi16
   // CHECK: @llvm.masked.load.v32i16.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_mask_loadu_epi16(__W, __U, __P); 
 }
 
 __m512i test_mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_maskz_loadu_epi16
+  // CHECK-LABEL: test_mm512_maskz_loadu_epi16
   // CHECK: @llvm.masked.load.v32i16.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_maskz_loadu_epi16(__U, __P); 
 }
 
 __m512i test_mm512_loadu_epi8 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_loadu_epi8
+  // CHECK-LABEL: test_mm512_loadu_epi8
   // CHECK: load <8 x i64>, ptr %{{.*}}, align 1{{$}}
   return _mm512_loadu_epi8 (__P);
 }
 
 __m512i test_mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_mask_loadu_epi8
+  // CHECK-LABEL: test_mm512_mask_loadu_epi8
   // CHECK: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_mask_loadu_epi8(__W, __U, __P); 
 }
 
 __m512i test_mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_maskz_loadu_epi8
+  // CHECK-LABEL: test_mm512_maskz_loadu_epi8
   // CHECK: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_maskz_loadu_epi8(__U, __P); 
 }
 
 void test_mm512_storeu_epi16(void *__P, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_storeu_epi16
+  // CHECK-LABEL: test_mm512_storeu_epi16
   // CHECK: store <8 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
   return _mm512_storeu_epi16(__P, __A); 
 }
 
 void test_mm512_mask_storeu_epi16(void *__P, __mmask32 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_storeu_epi16
+  // CHECK-LABEL: test_mm512_mask_storeu_epi16
   // CHECK: @llvm.masked.store.v32i16.p0(<32 x i16> %{{.*}}, ptr %{{.*}}, i32 1, <32 x i1> %{{.*}})
   return _mm512_mask_storeu_epi16(__P, __U, __A);
 }
 
 __mmask64 test_mm512_test_epi8_mask(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_test_epi8_mask
+  // CHECK-LABEL: test_mm512_test_epi8_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   return _mm512_test_epi8_mask(__A, __B); 
 }
 
 void test_mm512_storeu_epi8(void *__P, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_storeu_epi8
+  // CHECK-LABEL: test_mm512_storeu_epi8
   // CHECK: store <8 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
   return _mm512_storeu_epi8(__P, __A);
 }
 
 void test_mm512_mask_storeu_epi8(void *__P, __mmask64 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_storeu_epi8
+  // CHECK-LABEL: test_mm512_mask_storeu_epi8
   // CHECK: @llvm.masked.store.v64i8.p0(<64 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <64 x i1> %{{.*}})
   return _mm512_mask_storeu_epi8(__P, __U, __A); 
 }
 __mmask64 test_mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_test_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_test_epi8_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
@@ -2149,14 +2154,14 @@ __mmask64 test_mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B
 }
 
 __mmask32 test_mm512_test_epi16_mask(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_test_epi16_mask
+  // CHECK-LABEL: test_mm512_test_epi16_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   return _mm512_test_epi16_mask(__A, __B); 
 }
 
 __mmask32 test_mm512_mask_test_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_test_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_test_epi16_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
@@ -2164,14 +2169,14 @@ __mmask32 test_mm512_mask_test_epi16_mask(__mmask32 __U, __m512i __A, __m512i __
 }
 
 __mmask64 test_mm512_testn_epi8_mask(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_testn_epi8_mask
+  // CHECK-LABEL: test_mm512_testn_epi8_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return _mm512_testn_epi8_mask(__A, __B); 
 }
 
 __mmask64 test_mm512_mask_testn_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_testn_epi8_mask
+  // CHECK-LABEL: test_mm512_mask_testn_epi8_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
@@ -2179,14 +2184,14 @@ __mmask64 test_mm512_mask_testn_epi8_mask(__mmask64 __U, __m512i __A, __m512i __
 }
 
 __mmask32 test_mm512_testn_epi16_mask(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_testn_epi16_mask
+  // CHECK-LABEL: test_mm512_testn_epi16_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return _mm512_testn_epi16_mask(__A, __B); 
 }
 
 __mmask32 test_mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_testn_epi16_mask
+  // CHECK-LABEL: test_mm512_mask_testn_epi16_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
@@ -2194,67 +2199,69 @@ __mmask32 test_mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, __m512i _
 }
 
 __mmask64 test_mm512_movepi8_mask(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_movepi8_mask
+  // CHECK-LABEL: test_mm512_movepi8_mask
   // CHECK: [[CMP:%.*]] = icmp slt <64 x i8> %{{.*}}, zeroinitializer
   return _mm512_movepi8_mask(__A); 
 }
 
 __m512i test_mm512_movm_epi8(__mmask64 __A) {
-  // CHECK-LABEL: @test_mm512_movm_epi8
+  // CHECK-LABEL: test_mm512_movm_epi8
   // CHECK:  %{{.*}} = bitcast i64 %{{.*}} to <64 x i1>
   // CHECK:  %vpmovm2.i = sext <64 x i1> %{{.*}} to <64 x i8>
   return _mm512_movm_epi8(__A); 
 }
 
 __m512i test_mm512_movm_epi16(__mmask32 __A) {
-  // CHECK-LABEL: @test_mm512_movm_epi16
+  // CHECK-LABEL: test_mm512_movm_epi16
   // CHECK:  %{{.*}} = bitcast i32 %{{.*}} to <32 x i1>
   // CHECK:  %vpmovm2.i = sext <32 x i1> %{{.*}} to <32 x i16>
   return _mm512_movm_epi16(__A); 
 }
 
 __m512i test_mm512_broadcastb_epi8(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_broadcastb_epi8
+  // CHECK-LABEL: test_mm512_broadcastb_epi8
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <64 x i32> zeroinitializer
   return _mm512_broadcastb_epi8(__A);
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_broadcastb_epi8((__m128i)(__v16qi){42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
 
 __m512i test_mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcastb_epi8
+  // CHECK-LABEL: test_mm512_mask_broadcastb_epi8
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <64 x i32> zeroinitializer
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_broadcastb_epi8(__O, __M, __A);
 }
 
 __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcastb_epi8
+  // CHECK-LABEL: test_mm512_maskz_broadcastb_epi8
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <64 x i32> zeroinitializer
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_broadcastb_epi8(__M, __A);
 }
 
 __m512i test_mm512_broadcastw_epi16(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_broadcastw_epi16
+  // CHECK-LABEL: test_mm512_broadcastw_epi16
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <32 x i32> zeroinitializer
   return _mm512_broadcastw_epi16(__A);
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_broadcastw_epi16((__m128i)(__v8hi){42, 3, 10, 8, 0, 256, 256, 128}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
 
 __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcastw_epi16
+  // CHECK-LABEL: test_mm512_mask_broadcastw_epi16
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <32 x i32> zeroinitializer
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_broadcastw_epi16(__O, __M, __A);
 }
 
 __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcastw_epi16
+  // CHECK-LABEL: test_mm512_maskz_broadcastw_epi16
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <32 x i32> zeroinitializer
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_broadcastw_epi16(__M, __A);
 }
 
 __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
-  // CHECK-LABEL: @test_mm512_mask_set1_epi16
+  // CHECK-LABEL: test_mm512_mask_set1_epi16
   // CHECK: insertelement <32 x i16> poison, i16 %{{.*}}, i32 0
   // CHECK: insertelement <32 x i16> %{{.*}}, i16 %{{.*}}, i32 1
   // CHECK: insertelement <32 x i16> %{{.*}}, i16 %{{.*}}, i32 2
@@ -2292,7 +2299,7 @@ __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
 }
 
 __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
-  // CHECK-LABEL: @test_mm512_maskz_set1_epi16
+  // CHECK-LABEL: test_mm512_maskz_set1_epi16
   // CHECK: insertelement <32 x i16> poison, i16 %{{.*}}, i32 0
   // CHECK: insertelement <32 x i16> %{{.*}}, i16 %{{.*}}, i32 1
   // CHECK: insertelement <32 x i16> %{{.*}}, i16 %{{.*}}, i32 2
@@ -2329,39 +2336,39 @@ __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
   return _mm512_maskz_set1_epi16(__M, __A); 
 }
 __m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_permutexvar_epi16
+  // CHECK-LABEL: test_mm512_permutexvar_epi16
   // CHECK: @llvm.x86.avx512.permvar.hi.512
  return _mm512_permutexvar_epi16(__A, __B); 
 }
 
 __m512i test_mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) {
- // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi16
+ // CHECK-LABEL: test_mm512_maskz_permutexvar_epi16
   // CHECK: @llvm.x86.avx512.permvar.hi.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_permutexvar_epi16(__M, __A, __B); 
 }
 
 __m512i test_mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi16
+  // CHECK-LABEL: test_mm512_mask_permutexvar_epi16
   // CHECK: @llvm.x86.avx512.permvar.hi.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_permutexvar_epi16(__W, __M, __A, __B); 
 }
 __m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){
-    // CHECK-LABEL: @test_mm512_alignr_epi8
+    // CHECK-LABEL: test_mm512_alignr_epi8
     // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
     return _mm512_alignr_epi8(__A, __B, 2); 
 }
 
 __m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m512i __B){
-    // CHECK-LABEL: @test_mm512_mask_alignr_epi8
+    // CHECK-LABEL: test_mm512_mask_alignr_epi8
     // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
     // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
     return _mm512_mask_alignr_epi8(__W, __U, __A, __B, 2); 
 }
 
 __m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
-    // CHECK-LABEL: @test_mm512_maskz_alignr_epi8
+    // CHECK-LABEL: test_mm512_maskz_alignr_epi8
     // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
     // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
    return _mm512_maskz_alignr_epi8(__U, __A, __B, 2); 
@@ -2370,54 +2377,54 @@ __m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
 
 
 __m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mm_dbsad_epu8
+  // CHECK-LABEL: test_mm512_mm_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   return _mm512_dbsad_epu8(__A, __B, 170); 
 }
 
 __m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mm_mask_dbsad_epu8
+  // CHECK-LABEL: test_mm512_mm_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
 }
 
 __m512i test_mm512_mm_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mm_maskz_dbsad_epu8
+  // CHECK-LABEL: test_mm512_mm_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170); 
 }
 
 __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_sad_epu8
+  // CHECK-LABEL: test_mm512_sad_epu8
   // CHECK: @llvm.x86.avx512.psad.bw.512
   return _mm512_sad_epu8(__A, __B); 
 }
 
 __mmask32 test_mm512_movepi16_mask(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_movepi16_mask
+  // CHECK-LABEL: test_mm512_movepi16_mask
   // CHECK: [[CMP:%.*]] = icmp slt <32 x i16> %{{.*}}, zeroinitializer
   return _mm512_movepi16_mask(__A); 
 }
 
 void test_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
 {
- // CHECK-LABEL: @test_mm512_mask_cvtepi16_storeu_epi8
+ // CHECK-LABEL: test_mm512_mask_cvtepi16_storeu_epi8
  // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.512
  _mm512_mask_cvtepi16_storeu_epi8 ( __P,  __M, __A);
 }
 
 void test_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
 {
- // CHECK-LABEL: @test_mm512_mask_cvtsepi16_storeu_epi8
+ // CHECK-LABEL: test_mm512_mask_cvtsepi16_storeu_epi8
  // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.512
  _mm512_mask_cvtsepi16_storeu_epi8 ( __P,  __M, __A);
 }
 
 void test_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
 {
- // CHECK-LABEL: @test_mm512_mask_cvtusepi16_storeu_epi8
+ // CHECK-LABEL: test_mm512_mask_cvtusepi16_storeu_epi8
  // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.512
  _mm512_mask_cvtusepi16_storeu_epi8 ( __P, __M, __A);
 }
diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
index 1c01695..47e5a3f 100644
--- a/clang/test/CodeGen/X86/avx512dq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
@@ -355,7 +355,7 @@ __m512d test_mm512_and_pd (__m512d __A, __m512d __B) {
   // CHECK: and <8 x i64>
   return (__m512d) _mm512_and_pd(__A, __B);
 }
-TEST_CONSTEXPR(match_m512d(_mm512_and_pd((__m512d){-4.0, -5.0, +6.0, +7.0, +7.0, +6.0, -5.0, -4.0}, (__m512d){+0.0, -0.0, -0.0, +7.0, +7.0, -0.0, -0.0, +0.0}), -0.0, -0.0, +0.0, +7.0, +7.0, +0.0, -0.0, -0.0));
+TEST_CONSTEXPR(match_m512d(_mm512_and_pd((__m512d){-4.0, -5.0, +6.0, +7.0, +7.0, +6.0, -5.0, -4.0}, (__m512d){+0.0, -0.0, -0.0, +7.0, +7.0, -0.0, -0.0, +0.0}), +0.0, -0.0, +0.0, +7.0, +7.0, +0.0, -0.0, +0.0));
 
 __m512d test_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: test_mm512_mask_and_pd
@@ -378,7 +378,7 @@ __m512 test_mm512_and_ps (__m512 __A, __m512 __B) {
   // CHECK: and <16 x i32>
   return (__m512) _mm512_and_ps(__A, __B);
 }
-TEST_CONSTEXPR(match_m512(_mm512_and_ps((__m512){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m512){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), -0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, -0.0f, -0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, -0.0f));
+TEST_CONSTEXPR(match_m512(_mm512_and_ps((__m512){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m512){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), +0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, +0.0f, +0.0f, -0.0f, +0.0f, +7.0f, +7.0f, +0.0f, -0.0f, +0.0f));
 
 __m512 test_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: test_mm512_mask_and_ps
@@ -402,7 +402,7 @@ __m512d test_mm512_andnot_pd (__m512d __A, __m512d __B) {
   // CHECK: and <8 x i64>
   return (__m512d) _mm512_andnot_pd(__A, __B);
 }
-TEST_CONSTEXPR(match_m512d(_mm512_andnot_pd((__m512d){-4.0, -5.0, +6.0, +7.0, +7.0, +6.0, -5.0, -4.0}, (__m512d){+0.0, -0.0, -0.0, +7.0, +7.0, -0.0, -0.0, +0.0}), +0.0, +0.0, +0.0, +0.0, +0.0, +0.0, +0.0, +0.0));
+TEST_CONSTEXPR(match_m512d(_mm512_andnot_pd((__m512d){-4.0, -5.0, +6.0, +7.0, +7.0, +6.0, -5.0, -4.0}, (__m512d){+0.0, -0.0, -0.0, +7.0, +7.0, -0.0, -0.0, +0.0}), +0.0, +0.0, -0.0, +0.0, +0.0, -0.0, +0.0, +0.0));
 
 __m512d test_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: test_mm512_mask_andnot_pd
@@ -426,7 +426,7 @@ __m512 test_mm512_andnot_ps (__m512 __A, __m512 __B) {
   // CHECK: and <16 x i32>
   return (__m512) _mm512_andnot_ps(__A, __B);
 }
-TEST_CONSTEXPR(match_m512(_mm512_andnot_ps((__m512){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m512){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f, +0.0f));
+TEST_CONSTEXPR(match_m512(_mm512_andnot_ps((__m512){-4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f, -4.0f, -5.0f, +6.0f, +7.0f, +7.0f, +6.0f, -5.0f, -4.0f}, (__m512){+0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f, +0.0f, -0.0f, -0.0f, +7.0f, +7.0f, -0.0f, -0.0f, +0.0f}), +0.0f, +0.0f, -0.0f, +0.0f, +0.0f, -0.0f, +0.0f, +0.0f, +0.0f, +0.0f, -0.0f, +0.0f, +0.0f, -0.0f, +0.0f, +0.0f));
 
 __m512 test_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: test_mm512_mask_andnot_ps
@@ -1211,6 +1211,7 @@ __m512 test_mm512_broadcast_f32x2(__m128 __A) {
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   return _mm512_broadcast_f32x2(__A); 
 }
+TEST_CONSTEXPR(match_m512(_mm512_broadcast_f32x2((__m128)(__v4sf){1.0, -2.0, 3.0, -4.0}), 1.0, -2.0, 1.0, -2.0, 1.0, -2.0, 1.0, -2.0, 1.0, -2.0, 1.0, -2.0, 1.0, -2.0, 1.0, -2.0));
 
 __m512 test_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_f32x2
@@ -1231,6 +1232,7 @@ __m512 test_mm512_broadcast_f32x8(float const* __A) {
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm512_broadcast_f32x8(_mm256_loadu_ps(__A)); 
 }
+TEST_CONSTEXPR(match_m512(_mm512_broadcast_f32x8((__m256)(__v8sf){1.0f, 2.0f, 3.0f, 4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 1.0f, 2.0f, 3.0f, 4.0f, -5.0f, -6.0f, -7.0f, -8.0f, 1.0f, 2.0f, 3.0f, 4.0f, -5.0f, -6.0f, -7.0f, -8.0f));
 
 __m512 test_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, float const* __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_f32x8
@@ -1251,6 +1253,7 @@ __m512d test_mm512_broadcast_f64x2(double const* __A) {
   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   return _mm512_broadcast_f64x2(_mm_loadu_pd(__A)); 
 }
+TEST_CONSTEXPR(match_m512d(_mm512_broadcast_f64x2((__m128d)(__v2df){1.0, -2.0}), 1.0, -2.0, 1.0, -2.0, 1.0, -2.0, 1.0, -2.0));
 
 __m512d test_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, double const* __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_f64x2
@@ -1271,6 +1274,7 @@ __m512i test_mm512_broadcast_i32x2(__m128i __A) {
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   return _mm512_broadcast_i32x2(__A); 
 }
+TEST_CONSTEXPR(match_v16si(_mm512_broadcast_i32x2((__m128i)(__v4si){1, -2, 3, -4}), 1, -2, 1, -2, 1, -2, 1, -2, 1, -2, 1, -2, 1, -2, 1, -2));
 
 __m512i test_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_i32x2
@@ -1291,6 +1295,7 @@ __m512i test_mm512_broadcast_i32x8(__m256i const* __A) {
   // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm512_broadcast_i32x8(_mm256_loadu_si256(__A)); 
 }
+TEST_CONSTEXPR(match_v16si(_mm512_broadcast_i32x8((__m256i)(__v8si){1, 2, 3, 4, -5, -6, -7, -8}), 1, 2, 3, 4, -5, -6, -7, -8, 1, 2, 3, 4, -5, -6, -7, -8));
 
 __m512i test_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i const* __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_i32x8
@@ -1311,6 +1316,7 @@ __m512i test_mm512_broadcast_i64x2(__m128i const* __A) {
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   return _mm512_broadcast_i64x2(_mm_loadu_si128(__A)); 
 }
+TEST_CONSTEXPR(match_v8di(_mm512_broadcast_i64x2((__m128i)(__v2di){1, -2}), 1, -2, 1, -2, 1, -2, 1, -2));
 
 __m512i test_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i const* __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_i64x2
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 048bc30..b8eadc4 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -4691,6 +4691,7 @@ __m512i test_mm512_unpackhi_epi32(__m512i __A, __m512i __B) {
   // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   return _mm512_unpackhi_epi32(__A, __B); 
 }
+TEST_CONSTEXPR(match_v16si(_mm512_unpackhi_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m512i)(__v16si){16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), 2, 18, 3, 19, 6, 22, 7, 23, 10, 26, 11, 27, 14, 30, 15, 31));
 
 __m512d test_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: test_mm512_maskz_unpackhi_pd
@@ -5477,6 +5478,7 @@ __m512i test_mm512_unpackhi_epi64(__m512i __A, __m512i __B) {
   // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   return _mm512_unpackhi_epi64(__A, __B); 
 }
+TEST_CONSTEXPR(match_m512i(_mm512_unpackhi_epi64((__m512i){0, 1, 2, 3, 4, 5, 6, 7}, (__m512i){8, 9, 10, 11, 12, 13, 14, 15}), 1, 9, 3, 11, 5, 13, 7, 15));
 
 __m512i test_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_unpackhi_epi64
@@ -5497,6 +5499,7 @@ __m512i test_mm512_unpacklo_epi32(__m512i __A, __m512i __B) {
   // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   return _mm512_unpacklo_epi32(__A, __B); 
 }
+TEST_CONSTEXPR(match_v16si(_mm512_unpacklo_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m512i)(__v16si){16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), 0, 16, 1, 17, 4, 20, 5, 21, 8, 24, 9, 25, 12, 28, 13, 29));
 
 __m512i test_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_unpacklo_epi32
@@ -5517,6 +5520,7 @@ __m512i test_mm512_unpacklo_epi64(__m512i __A, __m512i __B) {
   // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   return _mm512_unpacklo_epi64(__A, __B); 
 }
+TEST_CONSTEXPR(match_m512i(_mm512_unpacklo_epi64((__m512i){0, 1, 2, 3, 4, 5, 6, 7}, (__m512i){8, 9, 10, 11, 12, 13, 14, 15}), 0, 8, 2, 10, 4, 12, 6, 14));
 
 __m512i test_mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_unpacklo_epi64
@@ -6307,6 +6311,7 @@ __m512 test_mm512_broadcast_f32x4(float const* __A) {
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   return _mm512_broadcast_f32x4(_mm_loadu_ps(__A)); 
 }
+TEST_CONSTEXPR(match_m512(_mm512_broadcast_f32x4((__m128)(__v4sf){1.0f, 2.0f, -3.0f, -4.0f}), 1.0f, 2.0f, -3.0f, -4.0f, 1.0f, 2.0f, -3.0f, -4.0f, 1.0f, 2.0f, -3.0f, -4.0f, 1.0f, 2.0f, -3.0f, -4.0f));
 
 __m512 test_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, float const* __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_f32x4
@@ -6327,6 +6332,7 @@ __m512d test_mm512_broadcast_f64x4(double const* __A) {
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   return _mm512_broadcast_f64x4(_mm256_loadu_pd(__A)); 
 }
+TEST_CONSTEXPR(match_m512d(_mm512_broadcast_f64x4((__m256d)(__v4df){1.0, 2.0, -3.0, -4.0}), 1.0, 2.0, -3.0, -4.0, 1.0, 2.0, -3.0, -4.0));
 
 __m512d test_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, double const* __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_f64x4
@@ -6347,6 +6353,7 @@ __m512i test_mm512_broadcast_i32x4(__m128i const* __A) {
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   return _mm512_broadcast_i32x4(_mm_loadu_si128(__A)); 
 }
+TEST_CONSTEXPR(match_v16si(_mm512_broadcast_i32x4((__m128i)(__v4si){1, 2, -3, -4}), 1, 2, -3, -4, 1, 2, -3, -4, 1, 2, -3, -4, 1, 2, -3, -4));
 
 __m512i test_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i const* __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_i32x4
@@ -6367,6 +6374,7 @@ __m512i test_mm512_broadcast_i64x4(__m256i const* __A) {
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   return _mm512_broadcast_i64x4(_mm256_loadu_si256(__A)); 
 }
+TEST_CONSTEXPR(match_v8di(_mm512_broadcast_i64x4((__m256i)(__v4di){1, 2, -3, -4}), 1, 2, -3, -4, 1, 2, -3, -4));
 
 __m512i test_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i const* __A) {
   // CHECK-LABEL: test_mm512_mask_broadcast_i64x4
@@ -6387,6 +6395,7 @@ __m512d test_mm512_broadcastsd_pd(__m128d __A) {
   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> zeroinitializer
   return _mm512_broadcastsd_pd(__A);
 }
+TEST_CONSTEXPR(match_m512d(_mm512_broadcastsd_pd((__m128d)(__v2df){1.0, 2.0}), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0));
 
 __m512d test_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) {
   // CHECK-LABEL: test_mm512_mask_broadcastsd_pd
@@ -6407,6 +6416,7 @@ __m512 test_mm512_broadcastss_ps(__m128 __A) {
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> zeroinitializer
   return _mm512_broadcastss_ps(__A);
 }
+TEST_CONSTEXPR(match_m512(_mm512_broadcastss_ps((__m128)(__v4sf){1.0f, 2.0f, -3.0f, -4.0f}), 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f));
 
 __m512 test_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) {
   // CHECK-LABEL: test_mm512_mask_broadcastss_ps
@@ -6427,6 +6437,7 @@ __m512i test_mm512_broadcastd_epi32(__m128i __A) {
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> zeroinitializer
   return _mm512_broadcastd_epi32(__A);
 }
+TEST_CONSTEXPR(match_v16si(_mm512_broadcastd_epi32((__m128i)(__v4si){-42, 0, 0, 0}), -42, -42, -42, -42, -42, -42, -42, -42, -42, -42, -42, -42, -42, -42, -42, -42));
 
 __m512i test_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) {
   // CHECK-LABEL: test_mm512_mask_broadcastd_epi32
@@ -6447,6 +6458,7 @@ __m512i test_mm512_broadcastq_epi64(__m128i __A) {
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> zeroinitializer
   return _mm512_broadcastq_epi64(__A);
 }
+TEST_CONSTEXPR(match_v8di(_mm512_broadcastq_epi64((__m128i)(__v2di){-42, 0}), -42, -42, -42, -42, -42, -42, -42, -42));
 
 __m512i test_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) {
   // CHECK-LABEL: test_mm512_mask_broadcastq_epi64
diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index ac7aa3e..f1ff210b0 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __mmask8 test_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: test_mm_cmpeq_epu32_mask
@@ -8121,6 +8122,7 @@ __m256 test_mm256_broadcast_f32x4(__m128 __A) {
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   return _mm256_broadcast_f32x4(__A); 
 }
+TEST_CONSTEXPR(match_m256(_mm256_broadcast_f32x4((__m128)(__v4sf){1.0f, 3.0f, -5.0f, -8.0f}), 1.0f, 3.0f, -5.0f, -8.0f, 1.0f, 3.0f, -5.0f, -8.0f));
 
 __m256 test_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) {
   // CHECK-LABEL: test_mm256_mask_broadcast_f32x4
@@ -8141,6 +8143,7 @@ __m256i test_mm256_broadcast_i32x4(__m128i const* __A) {
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   return _mm256_broadcast_i32x4(_mm_loadu_si128(__A)); 
 }
+TEST_CONSTEXPR(match_v8si(_mm256_broadcast_i32x4((__m128i)(__v4si){1, 3, -5, -8}), 1, 3, -5, -8, 1, 3, -5, -8));
 
 __m256i test_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i const* __A) {
   // CHECK-LABEL: test_mm256_mask_broadcast_i32x4
diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c
index cdbd19a..66ba0c7 100644
--- a/clang/test/CodeGen/X86/avx512vldq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c
@@ -1,50 +1,54 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m256i test_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm256_mullo_epi64
+  // CHECK-LABEL: test_mm256_mullo_epi64
   // CHECK: mul <4 x i64>
   return _mm256_mullo_epi64(__A, __B);
 }
 
 __m256i test_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm256_mask_mullo_epi64
+  // CHECK-LABEL: test_mm256_mask_mullo_epi64
   // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return (__m256i) _mm256_mask_mullo_epi64 ( __W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm256_maskz_mullo_epi64
+  // CHECK-LABEL: test_mm256_maskz_mullo_epi64
   // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return (__m256i) _mm256_maskz_mullo_epi64 (__U, __A, __B);
 }
 
 __m128i test_mm_mullo_epi64 (__m128i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm_mullo_epi64
+  // CHECK-LABEL: test_mm_mullo_epi64
   // CHECK: mul <2 x i64>
   return (__m128i) _mm_mullo_epi64(__A, __B);
 }
 
 __m128i test_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm_mask_mullo_epi64
+  // CHECK-LABEL: test_mm_mask_mullo_epi64
   // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return (__m128i) _mm_mask_mullo_epi64 ( __W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm_maskz_mullo_epi64
+  // CHECK-LABEL: test_mm_maskz_mullo_epi64
   // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return (__m128i) _mm_maskz_mullo_epi64 (__U, __A, __B);
 }
 
 __m256d test_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_mask_andnot_pd
+  // CHECK-LABEL: test_mm256_mask_andnot_pd
   // CHECK: xor <4 x i64> %{{.*}}, splat (i64 -1)
   // CHECK: and <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
@@ -52,7 +56,7 @@ __m256d test_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m25
 }
 
 __m256d test_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_maskz_andnot_pd
+  // CHECK-LABEL: test_mm256_maskz_andnot_pd
   // CHECK: xor <4 x i64> %{{.*}}, splat (i64 -1)
   // CHECK: and <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
@@ -60,7 +64,7 @@ __m256d test_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
 }
 
 __m128d test_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_andnot_pd
+  // CHECK-LABEL: test_mm_mask_andnot_pd
   // CHECK: xor <2 x i64> %{{.*}}, splat (i64 -1)
   // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
@@ -68,7 +72,7 @@ __m128d test_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d
 }
 
 __m128d test_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_andnot_pd
+  // CHECK-LABEL: test_mm_maskz_andnot_pd
   // CHECK: xor <2 x i64> %{{.*}}, splat (i64 -1)
   // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
@@ -76,7 +80,7 @@ __m128d test_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
 }
 
 __m256 test_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_mask_andnot_ps
+  // CHECK-LABEL: test_mm256_mask_andnot_ps
   // CHECK: xor <8 x i32> %{{.*}}, splat (i32 -1)
   // CHECK: and <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
@@ -84,7 +88,7 @@ __m256 test_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 _
 }
 
 __m256 test_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_maskz_andnot_ps
+  // CHECK-LABEL: test_mm256_maskz_andnot_ps
   // CHECK: xor <8 x i32> %{{.*}}, splat (i32 -1)
   // CHECK: and <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
@@ -92,7 +96,7 @@ __m256 test_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) {
 }
 
 __m128 test_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_andnot_ps
+  // CHECK-LABEL: test_mm_mask_andnot_ps
   // CHECK: xor <4 x i32> %{{.*}}, splat (i32 -1)
   // CHECK: and <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
@@ -100,7 +104,7 @@ __m128 test_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 }
 
 __m128 test_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_andnot_ps
+  // CHECK-LABEL: test_mm_maskz_andnot_ps
   // CHECK: xor <4 x i32> %{{.*}}, splat (i32 -1)
   // CHECK: and <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
@@ -108,776 +112,776 @@ __m128 test_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) {
 }
 
 __m256d test_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_mask_and_pd
+  // CHECK-LABEL: test_mm256_mask_and_pd
   // CHECK: and <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return (__m256d) _mm256_mask_and_pd ( __W, __U, __A, __B);
 }
 
 __m256d test_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_maskz_and_pd
+  // CHECK-LABEL: test_mm256_maskz_and_pd
   // CHECK: and <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return (__m256d) _mm256_maskz_and_pd (__U, __A, __B);
 }
 
 __m128d test_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_and_pd
+  // CHECK-LABEL: test_mm_mask_and_pd
   // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return (__m128d) _mm_mask_and_pd ( __W, __U, __A, __B);
 }
 
 __m128d test_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_and_pd
+  // CHECK-LABEL: test_mm_maskz_and_pd
   // CHECK: and <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return (__m128d) _mm_maskz_and_pd (__U, __A, __B);
 }
 
 __m256 test_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_mask_and_ps
+  // CHECK-LABEL: test_mm256_mask_and_ps
   // CHECK: and <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return (__m256) _mm256_mask_and_ps ( __W, __U, __A, __B);
 }
 
 __m256 test_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_maskz_and_ps
+  // CHECK-LABEL: test_mm256_maskz_and_ps
   // CHECK: and <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return (__m256) _mm256_maskz_and_ps (__U, __A, __B);
 }
 
 __m128 test_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_and_ps
+  // CHECK-LABEL: test_mm_mask_and_ps
   // CHECK: and <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return (__m128) _mm_mask_and_ps ( __W, __U, __A, __B);
 }
 
 __m128 test_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_and_ps
+  // CHECK-LABEL: test_mm_maskz_and_ps
   // CHECK: and <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return (__m128) _mm_maskz_and_ps (__U, __A, __B);
 }
 
 __m256d test_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_mask_xor_pd
+  // CHECK-LABEL: test_mm256_mask_xor_pd
   // CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return (__m256d) _mm256_mask_xor_pd ( __W, __U, __A, __B);
 }
 
 __m256d test_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_maskz_xor_pd
+  // CHECK-LABEL: test_mm256_maskz_xor_pd
   // CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return (__m256d) _mm256_maskz_xor_pd (__U, __A, __B);
 }
 
 __m128d test_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_xor_pd
+  // CHECK-LABEL: test_mm_mask_xor_pd
   // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return (__m128d) _mm_mask_xor_pd ( __W, __U, __A, __B);
 }
 
 __m128d test_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_xor_pd
+  // CHECK-LABEL: test_mm_maskz_xor_pd
   // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return (__m128d) _mm_maskz_xor_pd (__U, __A, __B);
 }
 
 __m256 test_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_mask_xor_ps
+  // CHECK-LABEL: test_mm256_mask_xor_ps
   // CHECK: xor <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return (__m256) _mm256_mask_xor_ps ( __W, __U, __A, __B);
 }
 
 __m256 test_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_maskz_xor_ps
+  // CHECK-LABEL: test_mm256_maskz_xor_ps
   // CHECK: xor <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return (__m256) _mm256_maskz_xor_ps (__U, __A, __B);
 }
 
 __m128 test_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_xor_ps
+  // CHECK-LABEL: test_mm_mask_xor_ps
   // CHECK: xor <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return (__m128) _mm_mask_xor_ps ( __W, __U, __A, __B);
 }
 
 __m128 test_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_xor_ps
+  // CHECK-LABEL: test_mm_maskz_xor_ps
   // CHECK: xor <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return (__m128) _mm_maskz_xor_ps (__U, __A, __B);
 }
 
 __m256d test_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_mask_or_pd
+  // CHECK-LABEL: test_mm256_mask_or_pd
   // CHECK: or <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return (__m256d) _mm256_mask_or_pd ( __W, __U, __A, __B);
 }
 
 __m256d test_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_maskz_or_pd
+  // CHECK-LABEL: test_mm256_maskz_or_pd
   // CHECK: or <4 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return (__m256d) _mm256_maskz_or_pd (__U, __A, __B);
 }
 
 __m128d test_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_or_pd
+  // CHECK-LABEL: test_mm_mask_or_pd
   // CHECK: or <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return (__m128d) _mm_mask_or_pd ( __W, __U, __A, __B);
 }
 
 __m128d test_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_or_pd
+  // CHECK-LABEL: test_mm_maskz_or_pd
   // CHECK: or <2 x i64> %{{.*}}, %{{.*}}
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return (__m128d) _mm_maskz_or_pd (__U, __A, __B);
 }
 
 __m256 test_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_mask_or_ps
+  // CHECK-LABEL: test_mm256_mask_or_ps
   // CHECK: or <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return (__m256) _mm256_mask_or_ps ( __W, __U, __A, __B);
 }
 
 __m256 test_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_maskz_or_ps
+  // CHECK-LABEL: test_mm256_maskz_or_ps
   // CHECK: or <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return (__m256) _mm256_maskz_or_ps (__U, __A, __B);
 }
 
 __m128 test_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_or_ps
+  // CHECK-LABEL: test_mm_mask_or_ps
   // CHECK: or <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return (__m128) _mm_mask_or_ps ( __W, __U, __A, __B);
 }
 
 __m128 test_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_or_ps
+  // CHECK-LABEL: test_mm_maskz_or_ps
   // CHECK: or <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return (__m128) _mm_maskz_or_ps(__U, __A, __B);
 }
 
 __m128i test_mm_cvtpd_epi64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtpd_epi64
+  // CHECK-LABEL: test_mm_cvtpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.128
   return _mm_cvtpd_epi64(__A); 
 }
 
 __m128i test_mm_mask_cvtpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_mask_cvtpd_epi64
+  // CHECK-LABEL: test_mm_mask_cvtpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.128
   return _mm_mask_cvtpd_epi64(__W, __U, __A); 
 }
 
 __m128i test_mm_maskz_cvtpd_epi64(__mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvtpd_epi64
+  // CHECK-LABEL: test_mm_maskz_cvtpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.128
   return _mm_maskz_cvtpd_epi64(__U, __A); 
 }
 
 __m256i test_mm256_cvtpd_epi64(__m256d __A) {
-  // CHECK-LABEL: @test_mm256_cvtpd_epi64
+  // CHECK-LABEL: test_mm256_cvtpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.256
   return _mm256_cvtpd_epi64(__A); 
 }
 
 __m256i test_mm256_mask_cvtpd_epi64(__m256i __W, __mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvtpd_epi64
+  // CHECK-LABEL: test_mm256_mask_cvtpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.256
   return _mm256_mask_cvtpd_epi64(__W, __U, __A); 
 }
 
 __m256i test_mm256_maskz_cvtpd_epi64(__mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvtpd_epi64
+  // CHECK-LABEL: test_mm256_maskz_cvtpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2qq.256
   return _mm256_maskz_cvtpd_epi64(__U, __A); 
 }
 
 __m128i test_mm_cvtpd_epu64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtpd_epu64
+  // CHECK-LABEL: test_mm_cvtpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.128
   return _mm_cvtpd_epu64(__A); 
 }
 
 __m128i test_mm_mask_cvtpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_mask_cvtpd_epu64
+  // CHECK-LABEL: test_mm_mask_cvtpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.128
   return _mm_mask_cvtpd_epu64(__W, __U, __A); 
 }
 
 __m128i test_mm_maskz_cvtpd_epu64(__mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvtpd_epu64
+  // CHECK-LABEL: test_mm_maskz_cvtpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.128
   return _mm_maskz_cvtpd_epu64(__U, __A); 
 }
 
 __m256i test_mm256_cvtpd_epu64(__m256d __A) {
-  // CHECK-LABEL: @test_mm256_cvtpd_epu64
+  // CHECK-LABEL: test_mm256_cvtpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.256
   return _mm256_cvtpd_epu64(__A); 
 }
 
 __m256i test_mm256_mask_cvtpd_epu64(__m256i __W, __mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvtpd_epu64
+  // CHECK-LABEL: test_mm256_mask_cvtpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.256
   return _mm256_mask_cvtpd_epu64(__W, __U, __A); 
 }
 
 __m256i test_mm256_maskz_cvtpd_epu64(__mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvtpd_epu64
+  // CHECK-LABEL: test_mm256_maskz_cvtpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtpd2uqq.256
   return _mm256_maskz_cvtpd_epu64(__U, __A); 
 }
 
 __m128i test_mm_cvtps_epi64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtps_epi64
+  // CHECK-LABEL: test_mm_cvtps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtps2qq.128
   return _mm_cvtps_epi64(__A); 
 }
 
 __m128i test_mm_mask_cvtps_epi64(__m128i __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_mask_cvtps_epi64
+  // CHECK-LABEL: test_mm_mask_cvtps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtps2qq.128
   return _mm_mask_cvtps_epi64(__W, __U, __A); 
 }
 
 __m128i test_mm_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvtps_epi64
+  // CHECK-LABEL: test_mm_maskz_cvtps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtps2qq.128
   return _mm_maskz_cvtps_epi64(__U, __A); 
 }
 
 __m256i test_mm256_cvtps_epi64(__m128 __A) {
-  // CHECK-LABEL: @test_mm256_cvtps_epi64
+  // CHECK-LABEL: test_mm256_cvtps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtps2qq.256
   return _mm256_cvtps_epi64(__A); 
 }
 
 __m256i test_mm256_mask_cvtps_epi64(__m256i __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvtps_epi64
+  // CHECK-LABEL: test_mm256_mask_cvtps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtps2qq.256
   return _mm256_mask_cvtps_epi64(__W, __U, __A); 
 }
 
 __m256i test_mm256_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvtps_epi64
+  // CHECK-LABEL: test_mm256_maskz_cvtps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvtps2qq.256
   return _mm256_maskz_cvtps_epi64(__U, __A); 
 }
 
 __m128i test_mm_cvtps_epu64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtps_epu64
+  // CHECK-LABEL: test_mm_cvtps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.128
   return _mm_cvtps_epu64(__A); 
 }
 
 __m128i test_mm_mask_cvtps_epu64(__m128i __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_mask_cvtps_epu64
+  // CHECK-LABEL: test_mm_mask_cvtps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.128
   return _mm_mask_cvtps_epu64(__W, __U, __A); 
 }
 
 __m128i test_mm_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvtps_epu64
+  // CHECK-LABEL: test_mm_maskz_cvtps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.128
   return _mm_maskz_cvtps_epu64(__U, __A); 
 }
 
 __m256i test_mm256_cvtps_epu64(__m128 __A) {
-  // CHECK-LABEL: @test_mm256_cvtps_epu64
+  // CHECK-LABEL: test_mm256_cvtps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.256
   return _mm256_cvtps_epu64(__A); 
 }
 
 __m256i test_mm256_mask_cvtps_epu64(__m256i __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvtps_epu64
+  // CHECK-LABEL: test_mm256_mask_cvtps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.256
   return _mm256_mask_cvtps_epu64(__W, __U, __A); 
 }
 
 __m256i test_mm256_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvtps_epu64
+  // CHECK-LABEL: test_mm256_maskz_cvtps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvtps2uqq.256
   return _mm256_maskz_cvtps_epu64(__U, __A); 
 }
 
 __m128d test_mm_cvtepi64_pd(__m128i __A) {
-  // CHECK-LABEL: @test_mm_cvtepi64_pd
+  // CHECK-LABEL: test_mm_cvtepi64_pd
   // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
   return _mm_cvtepi64_pd(__A); 
 }
 
 __m128d test_mm_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm_mask_cvtepi64_pd
+  // CHECK-LABEL: test_mm_mask_cvtepi64_pd
   // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_cvtepi64_pd(__W, __U, __A); 
 }
 
 __m128d test_mm_maskz_cvtepi64_pd(__mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvtepi64_pd
+  // CHECK-LABEL: test_mm_maskz_cvtepi64_pd
   // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_cvtepi64_pd(__U, __A); 
 }
 
 __m256d test_mm256_cvtepi64_pd(__m256i __A) {
-  // CHECK-LABEL: @test_mm256_cvtepi64_pd
+  // CHECK-LABEL: test_mm256_cvtepi64_pd
   // CHECK: sitofp <4 x i64> %{{.*}} to <4 x double>
   return _mm256_cvtepi64_pd(__A); 
 }
 
 __m256d test_mm256_mask_cvtepi64_pd(__m256d __W, __mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvtepi64_pd
+  // CHECK-LABEL: test_mm256_mask_cvtepi64_pd
   // CHECK: sitofp <4 x i64> %{{.*}} to <4 x double>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_cvtepi64_pd(__W, __U, __A); 
 }
 
 __m256d test_mm256_maskz_cvtepi64_pd(__mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvtepi64_pd
+  // CHECK-LABEL: test_mm256_maskz_cvtepi64_pd
   // CHECK: sitofp <4 x i64> %{{.*}} to <4 x double>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_cvtepi64_pd(__U, __A); 
 }
 
 __m128 test_mm_cvtepi64_ps(__m128i __A) {
-  // CHECK-LABEL: @test_mm_cvtepi64_ps
+  // CHECK-LABEL: test_mm_cvtepi64_ps
   // CHECK: @llvm.x86.avx512.mask.cvtqq2ps.128
   return _mm_cvtepi64_ps(__A); 
 }
 
 __m128 test_mm_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm_mask_cvtepi64_ps
+  // CHECK-LABEL: test_mm_mask_cvtepi64_ps
   // CHECK: @llvm.x86.avx512.mask.cvtqq2ps.128
   return _mm_mask_cvtepi64_ps(__W, __U, __A); 
 }
 
 __m128 test_mm_maskz_cvtepi64_ps(__mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvtepi64_ps
+  // CHECK-LABEL: test_mm_maskz_cvtepi64_ps
   // CHECK: @llvm.x86.avx512.mask.cvtqq2ps.128
   return _mm_maskz_cvtepi64_ps(__U, __A); 
 }
 
 __m128 test_mm256_cvtepi64_ps(__m256i __A) {
-  // CHECK-LABEL: @test_mm256_cvtepi64_ps
+  // CHECK-LABEL: test_mm256_cvtepi64_ps
   // CHECK: sitofp <4 x i64> %{{.*}} to <4 x float>
   return _mm256_cvtepi64_ps(__A); 
 }
 
 __m128 test_mm256_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvtepi64_ps
+  // CHECK-LABEL: test_mm256_mask_cvtepi64_ps
   // CHECK: sitofp <4 x i64> %{{.*}} to <4 x float>
   // select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm256_mask_cvtepi64_ps(__W, __U, __A); 
 }
 
 __m128 test_mm256_maskz_cvtepi64_ps(__mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvtepi64_ps
+  // CHECK-LABEL: test_mm256_maskz_cvtepi64_ps
   // CHECK: sitofp <4 x i64> %{{.*}} to <4 x float>
   // select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm256_maskz_cvtepi64_ps(__U, __A); 
 }
 
 __m128i test_mm_cvttpd_epi64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvttpd_epi64
+  // CHECK-LABEL: test_mm_cvttpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.128
   return _mm_cvttpd_epi64(__A); 
 }
 
 __m128i test_mm_mask_cvttpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_mask_cvttpd_epi64
+  // CHECK-LABEL: test_mm_mask_cvttpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.128
   return _mm_mask_cvttpd_epi64(__W, __U, __A); 
 }
 
 __m128i test_mm_maskz_cvttpd_epi64(__mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvttpd_epi64
+  // CHECK-LABEL: test_mm_maskz_cvttpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.128
   return _mm_maskz_cvttpd_epi64(__U, __A); 
 }
 
 __m256i test_mm256_cvttpd_epi64(__m256d __A) {
-  // CHECK-LABEL: @test_mm256_cvttpd_epi64
+  // CHECK-LABEL: test_mm256_cvttpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.256
   return _mm256_cvttpd_epi64(__A); 
 }
 
 __m256i test_mm256_mask_cvttpd_epi64(__m256i __W, __mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvttpd_epi64
+  // CHECK-LABEL: test_mm256_mask_cvttpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.256
   return _mm256_mask_cvttpd_epi64(__W, __U, __A); 
 }
 
 __m256i test_mm256_maskz_cvttpd_epi64(__mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvttpd_epi64
+  // CHECK-LABEL: test_mm256_maskz_cvttpd_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2qq.256
   return _mm256_maskz_cvttpd_epi64(__U, __A); 
 }
 
 __m128i test_mm_cvttpd_epu64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvttpd_epu64
+  // CHECK-LABEL: test_mm_cvttpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.128
   return _mm_cvttpd_epu64(__A); 
 }
 
 __m128i test_mm_mask_cvttpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_mask_cvttpd_epu64
+  // CHECK-LABEL: test_mm_mask_cvttpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.128
   return _mm_mask_cvttpd_epu64(__W, __U, __A); 
 }
 
 __m128i test_mm_maskz_cvttpd_epu64(__mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvttpd_epu64
+  // CHECK-LABEL: test_mm_maskz_cvttpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.128
   return _mm_maskz_cvttpd_epu64(__U, __A); 
 }
 
 __m256i test_mm256_cvttpd_epu64(__m256d __A) {
-  // CHECK-LABEL: @test_mm256_cvttpd_epu64
+  // CHECK-LABEL: test_mm256_cvttpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.256
   return _mm256_cvttpd_epu64(__A); 
 }
 
 __m256i test_mm256_mask_cvttpd_epu64(__m256i __W, __mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvttpd_epu64
+  // CHECK-LABEL: test_mm256_mask_cvttpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.256
   return _mm256_mask_cvttpd_epu64(__W, __U, __A); 
 }
 
 __m256i test_mm256_maskz_cvttpd_epu64(__mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvttpd_epu64
+  // CHECK-LABEL: test_mm256_maskz_cvttpd_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttpd2uqq.256
   return _mm256_maskz_cvttpd_epu64(__U, __A); 
 }
 
 __m128i test_mm_cvttps_epi64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvttps_epi64
+  // CHECK-LABEL: test_mm_cvttps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttps2qq.128
   return _mm_cvttps_epi64(__A); 
 }
 
 __m128i test_mm_mask_cvttps_epi64(__m128i __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_mask_cvttps_epi64
+  // CHECK-LABEL: test_mm_mask_cvttps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttps2qq.128
   return _mm_mask_cvttps_epi64(__W, __U, __A); 
 }
 
 __m128i test_mm_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvttps_epi64
+  // CHECK-LABEL: test_mm_maskz_cvttps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttps2qq.128
   return _mm_maskz_cvttps_epi64(__U, __A); 
 }
 
 __m256i test_mm256_cvttps_epi64(__m128 __A) {
-  // CHECK-LABEL: @test_mm256_cvttps_epi64
+  // CHECK-LABEL: test_mm256_cvttps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttps2qq.256
   return _mm256_cvttps_epi64(__A); 
 }
 
 __m256i test_mm256_mask_cvttps_epi64(__m256i __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvttps_epi64
+  // CHECK-LABEL: test_mm256_mask_cvttps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttps2qq.256
   return _mm256_mask_cvttps_epi64(__W, __U, __A); 
 }
 
 __m256i test_mm256_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvttps_epi64
+  // CHECK-LABEL: test_mm256_maskz_cvttps_epi64
   // CHECK: @llvm.x86.avx512.mask.cvttps2qq.256
   return _mm256_maskz_cvttps_epi64(__U, __A); 
 }
 
 __m128i test_mm_cvttps_epu64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvttps_epu64
+  // CHECK-LABEL: test_mm_cvttps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.128
   return _mm_cvttps_epu64(__A); 
 }
 
 __m128i test_mm_mask_cvttps_epu64(__m128i __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_mask_cvttps_epu64
+  // CHECK-LABEL: test_mm_mask_cvttps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.128
   return _mm_mask_cvttps_epu64(__W, __U, __A); 
 }
 
 __m128i test_mm_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvttps_epu64
+  // CHECK-LABEL: test_mm_maskz_cvttps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.128
   return _mm_maskz_cvttps_epu64(__U, __A); 
 }
 
 __m256i test_mm256_cvttps_epu64(__m128 __A) {
-  // CHECK-LABEL: @test_mm256_cvttps_epu64
+  // CHECK-LABEL: test_mm256_cvttps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.256
   return _mm256_cvttps_epu64(__A); 
 }
 
 __m256i test_mm256_mask_cvttps_epu64(__m256i __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvttps_epu64
+  // CHECK-LABEL: test_mm256_mask_cvttps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.256
   return _mm256_mask_cvttps_epu64(__W, __U, __A); 
 }
 
 __m256i test_mm256_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvttps_epu64
+  // CHECK-LABEL: test_mm256_maskz_cvttps_epu64
   // CHECK: @llvm.x86.avx512.mask.cvttps2uqq.256
   return _mm256_maskz_cvttps_epu64(__U, __A); 
 }
 
 __m128d test_mm_cvtepu64_pd(__m128i __A) {
-  // CHECK-LABEL: @test_mm_cvtepu64_pd
+  // CHECK-LABEL: test_mm_cvtepu64_pd
   // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
   return _mm_cvtepu64_pd(__A); 
 }
 
 __m128d test_mm_mask_cvtepu64_pd(__m128d __W, __mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm_mask_cvtepu64_pd
+  // CHECK-LABEL: test_mm_mask_cvtepu64_pd
   // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_cvtepu64_pd(__W, __U, __A); 
 }
 
 __m128d test_mm_maskz_cvtepu64_pd(__mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvtepu64_pd
+  // CHECK-LABEL: test_mm_maskz_cvtepu64_pd
   // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_cvtepu64_pd(__U, __A); 
 }
 
 __m256d test_mm256_cvtepu64_pd(__m256i __A) {
-  // CHECK-LABEL: @test_mm256_cvtepu64_pd
+  // CHECK-LABEL: test_mm256_cvtepu64_pd
   // CHECK: uitofp <4 x i64> %{{.*}} to <4 x double>
   return _mm256_cvtepu64_pd(__A); 
 }
 
 __m256d test_mm256_mask_cvtepu64_pd(__m256d __W, __mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvtepu64_pd
+  // CHECK-LABEL: test_mm256_mask_cvtepu64_pd
   // CHECK: uitofp <4 x i64> %{{.*}} to <4 x double>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_cvtepu64_pd(__W, __U, __A); 
 }
 
 __m256d test_mm256_maskz_cvtepu64_pd(__mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvtepu64_pd
+  // CHECK-LABEL: test_mm256_maskz_cvtepu64_pd
   // CHECK: uitofp <4 x i64> %{{.*}} to <4 x double>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_cvtepu64_pd(__U, __A); 
 }
 
 __m128 test_mm_cvtepu64_ps(__m128i __A) {
-  // CHECK-LABEL: @test_mm_cvtepu64_ps
+  // CHECK-LABEL: test_mm_cvtepu64_ps
   // CHECK: @llvm.x86.avx512.mask.cvtuqq2ps.128
   return _mm_cvtepu64_ps(__A); 
 }
 
 __m128 test_mm_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm_mask_cvtepu64_ps
+  // CHECK-LABEL: test_mm_mask_cvtepu64_ps
   // CHECK: @llvm.x86.avx512.mask.cvtuqq2ps.128
   return _mm_mask_cvtepu64_ps(__W, __U, __A); 
 }
 
 __m128 test_mm_maskz_cvtepu64_ps(__mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm_maskz_cvtepu64_ps
+  // CHECK-LABEL: test_mm_maskz_cvtepu64_ps
   // CHECK: @llvm.x86.avx512.mask.cvtuqq2ps.128
   return _mm_maskz_cvtepu64_ps(__U, __A); 
 }
 
 __m128 test_mm256_cvtepu64_ps(__m256i __A) {
-  // CHECK-LABEL: @test_mm256_cvtepu64_ps
+  // CHECK-LABEL: test_mm256_cvtepu64_ps
   // CHECK: uitofp <4 x i64> %{{.*}} to <4 x float>
   return _mm256_cvtepu64_ps(__A); 
 }
 
 __m128 test_mm256_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_mask_cvtepu64_ps
+  // CHECK-LABEL: test_mm256_mask_cvtepu64_ps
   // CHECK: uitofp <4 x i64> %{{.*}} to <4 x float>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm256_mask_cvtepu64_ps(__W, __U, __A); 
 }
 
 __m128 test_mm256_maskz_cvtepu64_ps(__mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_maskz_cvtepu64_ps
+  // CHECK-LABEL: test_mm256_maskz_cvtepu64_ps
   // CHECK: uitofp <4 x i64> %{{.*}} to <4 x float>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm256_maskz_cvtepu64_ps(__U, __A); 
 }
 
 __m128d test_mm_range_pd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_range_pd
+  // CHECK-LABEL: test_mm_range_pd
   // CHECK: @llvm.x86.avx512.mask.range.pd.128
   return _mm_range_pd(__A, __B, 4); 
 }
 
 __m128d test_mm_mask_range_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_range_pd
+  // CHECK-LABEL: test_mm_mask_range_pd
   // CHECK: @llvm.x86.avx512.mask.range.pd.128
   return _mm_mask_range_pd(__W, __U, __A, __B, 4); 
 }
 
 __m128d test_mm_maskz_range_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_range_pd
+  // CHECK-LABEL: test_mm_maskz_range_pd
   // CHECK: @llvm.x86.avx512.mask.range.pd.128
   return _mm_maskz_range_pd(__U, __A, __B, 4); 
 }
 
 __m256d test_mm256_range_pd(__m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_range_pd
+  // CHECK-LABEL: test_mm256_range_pd
   // CHECK: @llvm.x86.avx512.mask.range.pd.256
   return _mm256_range_pd(__A, __B, 4); 
 }
 
 __m256d test_mm256_mask_range_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_mask_range_pd
+  // CHECK-LABEL: test_mm256_mask_range_pd
   // CHECK: @llvm.x86.avx512.mask.range.pd.256
   return _mm256_mask_range_pd(__W, __U, __A, __B, 4); 
 }
 
 __m256d test_mm256_maskz_range_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm256_maskz_range_pd
+  // CHECK-LABEL: test_mm256_maskz_range_pd
   // CHECK: @llvm.x86.avx512.mask.range.pd.256
   return _mm256_maskz_range_pd(__U, __A, __B, 4); 
 }
 
 __m128 test_mm_range_ps(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_range_ps
+  // CHECK-LABEL: test_mm_range_ps
   // CHECK: @llvm.x86.avx512.mask.range.ps.128
   return _mm_range_ps(__A, __B, 4); 
 }
 
 __m128 test_mm_mask_range_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_range_ps
+  // CHECK-LABEL: test_mm_mask_range_ps
   // CHECK: @llvm.x86.avx512.mask.range.ps.128
   return _mm_mask_range_ps(__W, __U, __A, __B, 4); 
 }
 
 __m128 test_mm_maskz_range_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_range_ps
+  // CHECK-LABEL: test_mm_maskz_range_ps
   // CHECK: @llvm.x86.avx512.mask.range.ps.128
   return _mm_maskz_range_ps(__U, __A, __B, 4); 
 }
 
 __m256 test_mm256_range_ps(__m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_range_ps
+  // CHECK-LABEL: test_mm256_range_ps
   // CHECK: @llvm.x86.avx512.mask.range.ps.256
   return _mm256_range_ps(__A, __B, 4); 
 }
 
 __m256 test_mm256_mask_range_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_mask_range_ps
+  // CHECK-LABEL: test_mm256_mask_range_ps
   // CHECK: @llvm.x86.avx512.mask.range.ps.256
   return _mm256_mask_range_ps(__W, __U, __A, __B, 4); 
 }
 
 __m256 test_mm256_maskz_range_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  // CHECK-LABEL: @test_mm256_maskz_range_ps
+  // CHECK-LABEL: test_mm256_maskz_range_ps
   // CHECK: @llvm.x86.avx512.mask.range.ps.256
   return _mm256_maskz_range_ps(__U, __A, __B, 4); 
 }
 
 __m128d test_mm_reduce_pd(__m128d __A) {
-  // CHECK-LABEL: @test_mm_reduce_pd
+  // CHECK-LABEL: test_mm_reduce_pd
   // CHECK: @llvm.x86.avx512.mask.reduce.pd.128
   return _mm_reduce_pd(__A, 4); 
 }
 
 __m128d test_mm_mask_reduce_pd(__m128d __W, __mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_mask_reduce_pd
+  // CHECK-LABEL: test_mm_mask_reduce_pd
   // CHECK: @llvm.x86.avx512.mask.reduce.pd.128
   return _mm_mask_reduce_pd(__W, __U, __A, 4); 
 }
 
 __m128d test_mm_maskz_reduce_pd(__mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_maskz_reduce_pd
+  // CHECK-LABEL: test_mm_maskz_reduce_pd
   // CHECK: @llvm.x86.avx512.mask.reduce.pd.128
   return _mm_maskz_reduce_pd(__U, __A, 4); 
 }
 
 __m256d test_mm256_reduce_pd(__m256d __A) {
-  // CHECK-LABEL: @test_mm256_reduce_pd
+  // CHECK-LABEL: test_mm256_reduce_pd
   // CHECK: @llvm.x86.avx512.mask.reduce.pd.256
   return _mm256_reduce_pd(__A, 4); 
 }
 
 __m256d test_mm256_mask_reduce_pd(__m256d __W, __mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_mask_reduce_pd
+  // CHECK-LABEL: test_mm256_mask_reduce_pd
   // CHECK: @llvm.x86.avx512.mask.reduce.pd.256
   return _mm256_mask_reduce_pd(__W, __U, __A, 4); 
 }
 
 __m256d test_mm256_maskz_reduce_pd(__mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_maskz_reduce_pd
+  // CHECK-LABEL: test_mm256_maskz_reduce_pd
   // CHECK: @llvm.x86.avx512.mask.reduce.pd.256
   return _mm256_maskz_reduce_pd(__U, __A, 4); 
 }
 
 __m128 test_mm_reduce_ps(__m128 __A) {
-  // CHECK-LABEL: @test_mm_reduce_ps
+  // CHECK-LABEL: test_mm_reduce_ps
   // CHECK: @llvm.x86.avx512.mask.reduce.ps.128
   return _mm_reduce_ps(__A, 4); 
 }
 
 __m128 test_mm_mask_reduce_ps(__m128 __W, __mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_mask_reduce_ps
+  // CHECK-LABEL: test_mm_mask_reduce_ps
   // CHECK: @llvm.x86.avx512.mask.reduce.ps.128
   return _mm_mask_reduce_ps(__W, __U, __A, 4); 
 }
 
 __m128 test_mm_maskz_reduce_ps(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_maskz_reduce_ps
+  // CHECK-LABEL: test_mm_maskz_reduce_ps
   // CHECK: @llvm.x86.avx512.mask.reduce.ps.128
   return _mm_maskz_reduce_ps(__U, __A, 4); 
 }
 
 __m256 test_mm256_reduce_ps(__m256 __A) {
-  // CHECK-LABEL: @test_mm256_reduce_ps
+  // CHECK-LABEL: test_mm256_reduce_ps
   // CHECK: @llvm.x86.avx512.mask.reduce.ps.256
   return _mm256_reduce_ps(__A, 4); 
 }
 
 __m256 test_mm256_mask_reduce_ps(__m256 __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm256_mask_reduce_ps
+  // CHECK-LABEL: test_mm256_mask_reduce_ps
   // CHECK: @llvm.x86.avx512.mask.reduce.ps.256
   return _mm256_mask_reduce_ps(__W, __U, __A, 4); 
 }
 
 __m256 test_mm256_maskz_reduce_ps(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm256_maskz_reduce_ps
+  // CHECK-LABEL: test_mm256_maskz_reduce_ps
   // CHECK: @llvm.x86.avx512.mask.reduce.ps.256
   return _mm256_maskz_reduce_ps(__U, __A, 4); 
 }
 
 __mmask8 test_mm_movepi32_mask(__m128i __A) {
-  // CHECK-LABEL: @test_mm_movepi32_mask
+  // CHECK-LABEL: test_mm_movepi32_mask
   // CHECK: [[CMP:%.*]] = icmp slt <4 x i32> %{{.*}}, zeroinitializer
   // CHECK: [[SHUF:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm_movepi32_mask(__A); 
 }
 
 __mmask8 test_mm256_movepi32_mask(__m256i __A) {
-  // CHECK-LABEL: @test_mm256_movepi32_mask
+  // CHECK-LABEL: test_mm256_movepi32_mask
   // CHECK: [[CMP:%.*]] = icmp slt <8 x i32> %{{.*}}, zeroinitializer
   return _mm256_movepi32_mask(__A); 
 }
 
 __m128i test_mm_movm_epi32(__mmask8 __A) {
-  // CHECK-LABEL: @test_mm_movm_epi32
+  // CHECK-LABEL: test_mm_movm_epi32
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %extract.i = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: %vpmovm2.i = sext <4 x i1> %extract.i to <4 x i32>
@@ -885,14 +889,14 @@ __m128i test_mm_movm_epi32(__mmask8 __A) {
 }
 
 __m256i test_mm256_movm_epi32(__mmask8 __A) {
-  // CHECK-LABEL: @test_mm256_movm_epi32
+  // CHECK-LABEL: test_mm256_movm_epi32
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %vpmovm2.i = sext <8 x i1> %{{.*}} to <8 x i32>
   return _mm256_movm_epi32(__A); 
 }
 
 __m128i test_mm_movm_epi64(__mmask8 __A) {
-  // CHECK-LABEL: @test_mm_movm_epi64
+  // CHECK-LABEL: test_mm_movm_epi64
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %extract.i = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: %vpmovm2.i = sext <2 x i1> %extract.i to <2 x i64>
@@ -900,7 +904,7 @@ __m128i test_mm_movm_epi64(__mmask8 __A) {
 }
 
 __m256i test_mm256_movm_epi64(__mmask8 __A) {
-  // CHECK-LABEL: @test_mm256_movm_epi64
+  // CHECK-LABEL: test_mm256_movm_epi64
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %extract.i = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: %vpmovm2.i = sext <4 x i1> %extract.i to <4 x i64>
@@ -908,14 +912,14 @@ __m256i test_mm256_movm_epi64(__mmask8 __A) {
 }
 
 __mmask8 test_mm_movepi64_mask(__m128i __A) {
-  // CHECK-LABEL: @test_mm_movepi64_mask
+  // CHECK-LABEL: test_mm_movepi64_mask
   // CHECK: [[CMP:%.*]] = icmp slt <2 x i64> %{{.*}}, zeroinitializer
   // CHECK: [[SHUF:%.*]] = shufflevector <2 x i1> [[CMP]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   return _mm_movepi64_mask(__A); 
 }
 
 __mmask8 test_mm256_movepi64_mask(__m256i __A) {
-  // CHECK-LABEL: @test_mm256_movepi64_mask
+  // CHECK-LABEL: test_mm256_movepi64_mask
   // CHECK: [[CMP:%.*]] = icmp slt <4 x i64> %{{.*}}, zeroinitializer
   // CHECK: [[SHUF:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_movepi64_mask(__A); 
@@ -923,229 +927,234 @@ __mmask8 test_mm256_movepi64_mask(__m256i __A) {
 
 
 __m256 test_mm256_broadcast_f32x2(__m128 __A) {
-  // CHECK-LABEL: @test_mm256_broadcast_f32x2
+  // CHECK-LABEL: test_mm256_broadcast_f32x2
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   return _mm256_broadcast_f32x2(__A); 
 }
+TEST_CONSTEXPR(match_m256(_mm256_broadcast_f32x2((__m128)(__v4sf){1.0f, -2.0f, 3.0f, -4.0f}), 1.0f, -2.0f, 1.0f, -2.0f, 1.0f, -2.0f, 1.0f, -2.0f));
 
 __m256 test_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_mask_broadcast_f32x2
+  // CHECK-LABEL: test_mm256_mask_broadcast_f32x2
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_broadcast_f32x2(__O, __M, __A); 
 }
 
 __m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) {
-  // CHECK-LABEL: @test_mm256_maskz_broadcast_f32x2
+  // CHECK-LABEL: test_mm256_maskz_broadcast_f32x2
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_broadcast_f32x2(__M, __A); 
 }
 
 __m256d test_mm256_broadcast_f64x2(double const* __A) {
-  // CHECK-LABEL: @test_mm256_broadcast_f64x2
+  // CHECK-LABEL: test_mm256_broadcast_f64x2
   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   return _mm256_broadcast_f64x2(_mm_loadu_pd(__A)); 
 }
+TEST_CONSTEXPR(match_m256d(_mm256_broadcast_f64x2((__m128d)(__v2df){1.0, -2.0}), 1.0, -2.0, 1.0, -2.0));
 
 __m256d test_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, double const* __A) {
-  // CHECK-LABEL: @test_mm256_mask_broadcast_f64x2
+  // CHECK-LABEL: test_mm256_mask_broadcast_f64x2
   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A)); 
 }
 
 __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) {
-  // CHECK-LABEL: @test_mm256_maskz_broadcast_f64x2
+  // CHECK-LABEL: test_mm256_maskz_broadcast_f64x2
   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A)); 
 }
 
 __m128i test_mm_broadcast_i32x2(__m128i __A) {
-  // CHECK-LABEL: @test_mm_broadcast_i32x2
+  // CHECK-LABEL: test_mm_broadcast_i32x2
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   return _mm_broadcast_i32x2(__A); 
 }
+TEST_CONSTEXPR(match_v4si(_mm_broadcast_i32x2((__m128i)(__v4si){1, -2, 3, -4}), 1, -2, 1, -2));
 
 __m128i test_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm_mask_broadcast_i32x2
+  // CHECK-LABEL: test_mm_mask_broadcast_i32x2
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_broadcast_i32x2(__O, __M, __A); 
 }
 
 __m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm_maskz_broadcast_i32x2
+  // CHECK-LABEL: test_mm_maskz_broadcast_i32x2
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_broadcast_i32x2(__M, __A); 
 }
 
 __m256i test_mm256_broadcast_i32x2(__m128i __A) {
-  // CHECK-LABEL: @test_mm256_broadcast_i32x2
+  // CHECK-LABEL: test_mm256_broadcast_i32x2
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   return _mm256_broadcast_i32x2(__A); 
 }
+TEST_CONSTEXPR(match_v8si(_mm256_broadcast_i32x2((__m128i)(__v4si){1, -2, 3, -4}), 1, -2, 1, -2, 1, -2, 1, -2));
 
 __m256i test_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm256_mask_broadcast_i32x2
+  // CHECK-LABEL: test_mm256_mask_broadcast_i32x2
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_broadcast_i32x2(__O, __M, __A); 
 }
 
 __m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm256_maskz_broadcast_i32x2
+  // CHECK-LABEL: test_mm256_maskz_broadcast_i32x2
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_broadcast_i32x2(__M, __A); 
 }
 
 __m256i test_mm256_broadcast_i64x2(__m128i const* __A) {
-  // CHECK-LABEL: @test_mm256_broadcast_i64x2
+  // CHECK-LABEL: test_mm256_broadcast_i64x2
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   return _mm256_broadcast_i64x2(_mm_loadu_si128(__A)); 
 }
+TEST_CONSTEXPR(match_v4di(_mm256_broadcast_i64x2((__m128i)(__v2di){1, -2}), 1, -2, 1, -2));
 
 __m256i test_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i const* __A) {
-  // CHECK-LABEL: @test_mm256_mask_broadcast_i64x2
+  // CHECK-LABEL: test_mm256_mask_broadcast_i64x2
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_broadcast_i64x2(__O, __M, _mm_loadu_si128(__A)); 
 }
 
 __m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i const* __A) {
-  // CHECK-LABEL: @test_mm256_maskz_broadcast_i64x2
+  // CHECK-LABEL: test_mm256_maskz_broadcast_i64x2
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_broadcast_i64x2(__M, _mm_loadu_si128(__A)); 
 }
 
 __m128d test_mm256_extractf64x2_pd(__m256d __A) {
-  // CHECK-LABEL: @test_mm256_extractf64x2_pd
+  // CHECK-LABEL: test_mm256_extractf64x2_pd
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
   return _mm256_extractf64x2_pd(__A, 1); 
 }
 
 __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_mask_extractf64x2_pd
+  // CHECK-LABEL: test_mm256_mask_extractf64x2_pd
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1); 
 }
 
 __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_maskz_extractf64x2_pd
+  // CHECK-LABEL: test_mm256_maskz_extractf64x2_pd
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm256_maskz_extractf64x2_pd(__U, __A, 1); 
 }
 
 __m128i test_mm256_extracti64x2_epi64(__m256i __A) {
-  // CHECK-LABEL: @test_mm256_extracti64x2_epi64
+  // CHECK-LABEL: test_mm256_extracti64x2_epi64
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
   return _mm256_extracti64x2_epi64(__A, 1); 
 }
 
 __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_mask_extracti64x2_epi64
+  // CHECK-LABEL: test_mm256_mask_extracti64x2_epi64
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1); 
 }
 
 __m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm256_maskz_extracti64x2_epi64
+  // CHECK-LABEL: test_mm256_maskz_extracti64x2_epi64
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm256_maskz_extracti64x2_epi64(__U, __A, 1); 
 }
 
 __m256d test_mm256_insertf64x2(__m256d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm256_insertf64x2
+  // CHECK-LABEL: test_mm256_insertf64x2
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   return _mm256_insertf64x2(__A, __B, 1); 
 }
 
 __m256d test_mm256_mask_insertf64x2(__m256d __W, __mmask8 __U, __m256d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm256_mask_insertf64x2
+  // CHECK-LABEL: test_mm256_mask_insertf64x2
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_insertf64x2(__W, __U, __A, __B, 1); 
 }
 
 __m256d test_mm256_maskz_insertf64x2(__mmask8 __U, __m256d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm256_maskz_insertf64x2
+  // CHECK-LABEL: test_mm256_maskz_insertf64x2
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_insertf64x2(__U, __A, __B, 1); 
 }
 
 __m256i test_mm256_inserti64x2(__m256i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm256_inserti64x2
+  // CHECK-LABEL: test_mm256_inserti64x2
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   return _mm256_inserti64x2(__A, __B, 1); 
 }
 
 __m256i test_mm256_mask_inserti64x2(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm256_mask_inserti64x2
+  // CHECK-LABEL: test_mm256_mask_inserti64x2
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_inserti64x2(__W, __U, __A, __B, 1); 
 }
 
 __m256i test_mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm256_maskz_inserti64x2
+  // CHECK-LABEL: test_mm256_maskz_inserti64x2
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_inserti64x2(__U, __A, __B, 1); 
 }
 
 __mmask8 test_mm_mask_fpclass_pd_mask(__mmask8 __U, __m128d __A) {
-  // CHECK-LABEL: @test_mm_mask_fpclass_pd_mask
+  // CHECK-LABEL: test_mm_mask_fpclass_pd_mask
   // CHECK: @llvm.x86.avx512.fpclass.pd.128
   return _mm_mask_fpclass_pd_mask(__U, __A, 2); 
 }
 
 __mmask8 test_mm_fpclass_pd_mask(__m128d __A) {
-  // CHECK-LABEL: @test_mm_fpclass_pd_mask
+  // CHECK-LABEL: test_mm_fpclass_pd_mask
   // CHECK: @llvm.x86.avx512.fpclass.pd.128
   return _mm_fpclass_pd_mask(__A, 2); 
 }
 
 __mmask8 test_mm256_mask_fpclass_pd_mask(__mmask8 __U, __m256d __A) {
-  // CHECK-LABEL: @test_mm256_mask_fpclass_pd_mask
+  // CHECK-LABEL: test_mm256_mask_fpclass_pd_mask
   // CHECK: @llvm.x86.avx512.fpclass.pd.256
   return _mm256_mask_fpclass_pd_mask(__U, __A, 2); 
 }
 
 __mmask8 test_mm256_fpclass_pd_mask(__m256d __A) {
-  // CHECK-LABEL: @test_mm256_fpclass_pd_mask
+  // CHECK-LABEL: test_mm256_fpclass_pd_mask
   // CHECK: @llvm.x86.avx512.fpclass.pd.256
   return _mm256_fpclass_pd_mask(__A, 2); 
 }
 
 __mmask8 test_mm_mask_fpclass_ps_mask(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_mask_fpclass_ps_mask
+  // CHECK-LABEL: test_mm_mask_fpclass_ps_mask
   // CHECK: @llvm.x86.avx512.fpclass.ps.128
   return _mm_mask_fpclass_ps_mask(__U, __A, 2); 
 }
 
 __mmask8 test_mm_fpclass_ps_mask(__m128 __A) {
-  // CHECK-LABEL: @test_mm_fpclass_ps_mask
+  // CHECK-LABEL: test_mm_fpclass_ps_mask
   // CHECK: @llvm.x86.avx512.fpclass.ps.128
   return _mm_fpclass_ps_mask(__A, 2); 
 }
 
 __mmask8 test_mm256_mask_fpclass_ps_mask(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm256_mask_fpclass_ps_mask
+  // CHECK-LABEL: test_mm256_mask_fpclass_ps_mask
   // CHECK: @llvm.x86.avx512.fpclass.ps.256
   return _mm256_mask_fpclass_ps_mask(__U, __A, 2); 
 }
 
 __mmask8 test_mm256_fpclass_ps_mask(__m256 __A) {
-  // CHECK-LABEL: @test_mm256_fpclass_ps_mask
+  // CHECK-LABEL: test_mm256_fpclass_ps_mask
   // CHECK: @llvm.x86.avx512.fpclass.ps.256
   return _mm256_fpclass_ps_mask(__A, 2); 
 }
diff --git a/clang/test/CodeGen/X86/builtin_test_helpers.h b/clang/test/CodeGen/X86/builtin_test_helpers.h
index b83ca4d..6541ca4 100644
--- a/clang/test/CodeGen/X86/builtin_test_helpers.h
+++ b/clang/test/CodeGen/X86/builtin_test_helpers.h
@@ -28,8 +28,8 @@ constexpr bool match_v4hu(__m64 _v, unsigned short a, unsigned short b, unsigned
   return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
 }
 
-constexpr bool match_v8qi(__m64 _v, char a, char b, char c, char d, char e, char f, char g, char h) {
-  __v8qi v = (__v8qi)_v;
+constexpr bool match_v8qi(__m64 _v, signed char a, signed char b, signed char c, signed char d, signed char e, signed char f, signed char g, signed char h) {
+  __v8qs v = (__v8qs)_v;
   return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h;
 }
 
@@ -38,17 +38,20 @@ constexpr bool match_v8qu(__m64 _v, unsigned char a, unsigned char b, unsigned c
   return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h;
 }
 
-constexpr bool match_m128(__m128 v, float a, float b, float c, float d) {
-  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+constexpr bool match_m128(__m128 _v, float a, float b, float c, float d) {
+  __v4su v = (__v4su)_v;
+  return v[0] == __builtin_bit_cast(unsigned, a) && v[1] == __builtin_bit_cast(unsigned, b) && v[2] == __builtin_bit_cast(unsigned, c) && v[3] == __builtin_bit_cast(unsigned, d);
 }
 
-constexpr bool match_m128d(__m128d v, double a, double b) {
-  return v[0] == a && v[1] == b;
+constexpr bool match_m128d(__m128d _v, double a, double b) {
+  __v2du v = (__v2du)_v;
+  return v[0] == __builtin_bit_cast(unsigned long long, a) && v[1] == __builtin_bit_cast(unsigned long long, b);
 }
 
 constexpr bool match_m128h(__m128h _v, _Float16 __e00, _Float16 __e01, _Float16 __e02, _Float16 __e03, _Float16 __e04, _Float16 __e05, _Float16 __e06, _Float16 __e07) {
-  __v8hf v = (__v8hf)_v;
-  return v[ 0] == __e00 && v[ 1] == __e01 && v[ 2] == __e02 && v[ 3] == __e03 && v[ 4] == __e04 && v[ 5] == __e05 && v[ 6] == __e06 && v[ 7] ==  __e07;
+  __v8hu v = (__v8hu)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned short, __e00) && v[ 1] == __builtin_bit_cast(unsigned short, __e01) && v[ 2] == __builtin_bit_cast(unsigned short, __e02) && v[ 3] == __builtin_bit_cast(unsigned short, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned short, __e04) && v[ 5] == __builtin_bit_cast(unsigned short, __e05) && v[ 6] == __builtin_bit_cast(unsigned short, __e06) && v[ 7] == __builtin_bit_cast(unsigned short, __e07);
 }
 
 constexpr bool match_m128i(__m128i _v, unsigned long long a, unsigned long long b) {
@@ -75,8 +78,8 @@ constexpr bool match_v8hu(__m128i _v, unsigned short a, unsigned short b, unsign
   return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h;
 }
 
-constexpr bool match_v16qi(__m128i _v, char a, char b, char c, char d, char e, char f, char g, char h, char i, char j, char k, char l, char m, char n, char o, char p) {
-  __v16qi v = (__v16qi)_v;
+constexpr bool match_v16qi(__m128i _v, signed char a, signed char b, signed char c, signed char d, signed char e, signed char f, signed char g, signed char h, signed char i, signed char j, signed char k, signed char l, signed char m, signed char n, signed char o, signed char p) {
+  __v16qs v = (__v16qs)_v;
   return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] == l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
 }
 
@@ -85,19 +88,24 @@ constexpr bool match_v16qu(__m128i _v, unsigned char a, unsigned char b, unsigne
   return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] == l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
 }
 
-constexpr bool match_m256(__m256 v, float a, float b, float c, float d, float e, float f, float g, float h) {
-  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h;
+constexpr bool match_m256(__m256 _v, float __e00, float __e01, float __e02, float __e03, float __e04, float __e05, float __e06, float __e07) {
+  __v8su v = (__v8su)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned, __e00) && v[ 1] == __builtin_bit_cast(unsigned, __e01) && v[ 2] == __builtin_bit_cast(unsigned, __e02) && v[ 3] == __builtin_bit_cast(unsigned, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned, __e04) && v[ 5] == __builtin_bit_cast(unsigned, __e05) && v[ 6] == __builtin_bit_cast(unsigned, __e06) && v[ 7] == __builtin_bit_cast(unsigned, __e07);
 }
 
-constexpr bool match_m256d(__m256d v, double a, double b, double c, double d) {
-  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+constexpr bool match_m256d(__m256d _v, double a, double b, double c, double d) {
+  __v4du v = (__v4du)_v;
+  return v[0] == __builtin_bit_cast(unsigned long long, a) && v[1] == __builtin_bit_cast(unsigned long long, b) && v[2] == __builtin_bit_cast(unsigned long long, c) && v[3] == __builtin_bit_cast(unsigned long long, d);
 }
 
 constexpr bool match_m256h(__m256h _v, _Float16 __e00, _Float16 __e01, _Float16 __e02, _Float16 __e03, _Float16 __e04, _Float16 __e05, _Float16 __e06, _Float16 __e07,
                                        _Float16 __e08, _Float16 __e09, _Float16 __e10, _Float16 __e11, _Float16 __e12, _Float16 __e13, _Float16 __e14, _Float16 __e15) {
-  __v16hf v = (__v16hf)_v;
-  return v[ 0] == __e00 && v[ 1] == __e01 && v[ 2] == __e02 && v[ 3] == __e03 && v[ 4] == __e04 && v[ 5] == __e05 && v[ 6] == __e06 && v[ 7] ==  __e07 &&
-         v[ 8] == __e08 && v[ 9] == __e09 && v[10] == __e10 && v[11] == __e11 && v[12] == __e12 && v[13] == __e13 && v[14] == __e14 && v[15] ==  __e15;
+  __v16hu v = (__v16hu)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned short, __e00) && v[ 1] == __builtin_bit_cast(unsigned short, __e01) && v[ 2] == __builtin_bit_cast(unsigned short, __e02) && v[ 3] == __builtin_bit_cast(unsigned short, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned short, __e04) && v[ 5] == __builtin_bit_cast(unsigned short, __e05) && v[ 6] == __builtin_bit_cast(unsigned short, __e06) && v[ 7] == __builtin_bit_cast(unsigned short, __e07) &&
+         v[ 8] == __builtin_bit_cast(unsigned short, __e08) && v[ 9] == __builtin_bit_cast(unsigned short, __e09) && v[10] == __builtin_bit_cast(unsigned short, __e10) && v[11] == __builtin_bit_cast(unsigned short, __e11) &&
+         v[12] == __builtin_bit_cast(unsigned short, __e12) && v[13] == __builtin_bit_cast(unsigned short, __e13) && v[14] == __builtin_bit_cast(unsigned short, __e14) && v[15] == __builtin_bit_cast(unsigned short, __e15);
 }
 
 constexpr bool match_m256i(__m256i _v, unsigned long long a, unsigned long long b, unsigned long long c, unsigned long long d) {
@@ -125,11 +133,11 @@ constexpr bool match_v16hu(__m256i _v, unsigned short a, unsigned short b, unsig
   return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] == l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
 }
 
-constexpr bool match_v32qi(__m256i _v, char __b00, char __b01, char __b02, char __b03, char __b04, char __b05, char __b06, char __b07,
-                                       char __b08, char __b09, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15,
-                                       char __b16, char __b17, char __b18, char __b19, char __b20, char __b21, char __b22, char __b23,
-                                       char __b24, char __b25, char __b26, char __b27, char __b28, char __b29, char __b30, char __b31) {
-  __v32qi v = (__v32qi)_v;
+constexpr bool match_v32qi(__m256i _v, signed char __b00, signed char __b01, signed char __b02, signed char __b03, signed char __b04, signed char __b05, signed char __b06, signed char __b07,
+                                       signed char __b08, signed char __b09, signed char __b10, signed char __b11, signed char __b12, signed char __b13, signed char __b14, signed char __b15,
+                                       signed char __b16, signed char __b17, signed char __b18, signed char __b19, signed char __b20, signed char __b21, signed char __b22, signed char __b23,
+                                       signed char __b24, signed char __b25, signed char __b26, signed char __b27, signed char __b28, signed char __b29, signed char __b30, signed char __b31) {
+  __v32qs v = (__v32qs)_v;
   return v[ 0] == __b00 && v[ 1] == __b01 && v[ 2] == __b02 && v[ 3] == __b03 && v[ 4] == __b04 && v[ 5] == __b05 && v[ 6] == __b06 && v[ 7] ==  __b07 &&
          v[ 8] == __b08 && v[ 9] == __b09 && v[10] == __b10 && v[11] == __b11 && v[12] == __b12 && v[13] == __b13 && v[14] == __b14 && v[15] ==  __b15 &&
          v[16] == __b16 && v[17] == __b17 && v[18] == __b18 && v[19] == __b19 && v[20] == __b20 && v[21] == __b21 && v[22] == __b22 && v[23] ==  __b23 &&
@@ -147,23 +155,33 @@ constexpr bool match_v32qu(__m256i _v, unsigned char __b00, unsigned char __b01,
          v[24] == __b24 && v[25] == __b25 && v[26] == __b26 && v[27] == __b27 && v[28] == __b28 && v[29] == __b29 && v[30] == __b30 && v[31] ==  __b31;
 }
 
-constexpr bool match_m512(__m512 v, float a, float b, float c, float d, float e, float f, float g, float h, float i, float j, float k, float l, float m, float n, float o, float p) {
-  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] == l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
+constexpr bool match_m512(__m512 _v, float __e00, float __e01, float __e02, float __e03, float __e04, float __e05, float __e06, float __e07, float __e08, float __e09, float __e10, float __e11, float __e12, float __e13, float __e14, float __e15) {
+  __v16su v = (__v16su)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned, __e00) && v[ 1] == __builtin_bit_cast(unsigned, __e01) && v[ 2] == __builtin_bit_cast(unsigned, __e02) && v[ 3] == __builtin_bit_cast(unsigned, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned, __e04) && v[ 5] == __builtin_bit_cast(unsigned, __e05) && v[ 6] == __builtin_bit_cast(unsigned, __e06) && v[ 7] == __builtin_bit_cast(unsigned, __e07) &&
+         v[ 8] == __builtin_bit_cast(unsigned, __e08) && v[ 9] == __builtin_bit_cast(unsigned, __e09) && v[10] == __builtin_bit_cast(unsigned, __e10) && v[11] == __builtin_bit_cast(unsigned, __e11) &&
+         v[12] == __builtin_bit_cast(unsigned, __e12) && v[13] == __builtin_bit_cast(unsigned, __e13) && v[14] == __builtin_bit_cast(unsigned, __e14) && v[15] == __builtin_bit_cast(unsigned, __e15);
 }
 
-constexpr bool match_m512d(__m512d v, double a, double b, double c, double d, double e, double f, double g, double h) {
-  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] == f && v[6] == g && v[7] == h;
+constexpr bool match_m512d(__m512d _v, double __e00, double __e01, double __e02, double __e03, double __e04, double __e05, double __e06, double __e07) {
+  __v8du v = (__v8du)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned long long, __e00) && v[ 1] == __builtin_bit_cast(unsigned long long, __e01) && v[ 2] == __builtin_bit_cast(unsigned long long, __e02) && v[ 3] == __builtin_bit_cast(unsigned long long, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned long long, __e04) && v[ 5] == __builtin_bit_cast(unsigned long long, __e05) && v[ 6] == __builtin_bit_cast(unsigned long long, __e06) && v[ 7] == __builtin_bit_cast(unsigned long long, __e07);
 }
 
 constexpr bool match_m512h(__m512h _v, _Float16 __e00, _Float16 __e01, _Float16 __e02, _Float16 __e03, _Float16 __e04, _Float16 __e05, _Float16 __e06, _Float16 __e07,
                                        _Float16 __e08, _Float16 __e09, _Float16 __e10, _Float16 __e11, _Float16 __e12, _Float16 __e13, _Float16 __e14, _Float16 __e15,
                                        _Float16 __e16, _Float16 __e17, _Float16 __e18, _Float16 __e19, _Float16 __e20, _Float16 __e21, _Float16 __e22, _Float16 __e23,
                                        _Float16 __e24, _Float16 __e25, _Float16 __e26, _Float16 __e27, _Float16 __e28, _Float16 __e29, _Float16 __e30, _Float16 __e31) {
-  __v32hf v = (__v32hf)_v;
-  return v[ 0] == __e00 && v[ 1] == __e01 && v[ 2] == __e02 && v[ 3] == __e03 && v[ 4] == __e04 && v[ 5] == __e05 && v[ 6] == __e06 && v[ 7] ==  __e07 &&
-         v[ 8] == __e08 && v[ 9] == __e09 && v[10] == __e10 && v[11] == __e11 && v[12] == __e12 && v[13] == __e13 && v[14] == __e14 && v[15] ==  __e15 &&
-         v[16] == __e16 && v[17] == __e17 && v[18] == __e18 && v[19] == __e19 && v[20] == __e20 && v[21] == __e21 && v[22] == __e22 && v[23] ==  __e23 &&
-         v[24] == __e24 && v[25] == __e25 && v[26] == __e26 && v[27] == __e27 && v[28] == __e28 && v[29] == __e29 && v[30] == __e30 && v[31] ==  __e31;
+  __v32hu v = (__v32hu)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned short, __e00) && v[ 1] == __builtin_bit_cast(unsigned short, __e01) && v[ 2] == __builtin_bit_cast(unsigned short, __e02) && v[ 3] == __builtin_bit_cast(unsigned short, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned short, __e04) && v[ 5] == __builtin_bit_cast(unsigned short, __e05) && v[ 6] == __builtin_bit_cast(unsigned short, __e06) && v[ 7] == __builtin_bit_cast(unsigned short, __e07) &&
+         v[ 8] == __builtin_bit_cast(unsigned short, __e08) && v[ 9] == __builtin_bit_cast(unsigned short, __e09) && v[10] == __builtin_bit_cast(unsigned short, __e10) && v[11] == __builtin_bit_cast(unsigned short, __e11) &&
+         v[12] == __builtin_bit_cast(unsigned short, __e12) && v[13] == __builtin_bit_cast(unsigned short, __e13) && v[14] == __builtin_bit_cast(unsigned short, __e14) && v[15] == __builtin_bit_cast(unsigned short, __e15) &&
+         v[16] == __builtin_bit_cast(unsigned short, __e16) && v[17] == __builtin_bit_cast(unsigned short, __e17) && v[18] == __builtin_bit_cast(unsigned short, __e18) && v[19] == __builtin_bit_cast(unsigned short, __e19) &&
+         v[20] == __builtin_bit_cast(unsigned short, __e20) && v[21] == __builtin_bit_cast(unsigned short, __e21) && v[22] == __builtin_bit_cast(unsigned short, __e22) && v[23] == __builtin_bit_cast(unsigned short, __e23) &&
+         v[24] == __builtin_bit_cast(unsigned short, __e24) && v[25] == __builtin_bit_cast(unsigned short, __e25) && v[26] == __builtin_bit_cast(unsigned short, __e26) && v[27] == __builtin_bit_cast(unsigned short, __e27) &&
+         v[28] == __builtin_bit_cast(unsigned short, __e28) && v[29] == __builtin_bit_cast(unsigned short, __e29) && v[30] == __builtin_bit_cast(unsigned short, __e30) && v[31] == __builtin_bit_cast(unsigned short, __e31);
 }
 
 constexpr bool match_m512i(__m512i _v, unsigned long long a, unsigned long long b, unsigned long long c, unsigned long long d, unsigned long long e, unsigned long long f, unsigned long long g, unsigned long long h) {
@@ -203,15 +221,15 @@ constexpr bool match_v32hu(__m512i _v, unsigned short __e00, unsigned short __e0
          v[24] == __e24 && v[25] == __e25 && v[26] == __e26 && v[27] == __e27 && v[28] == __e28 && v[29] == __e29 && v[30] == __e30 && v[31] ==  __e31;
 }
 
-constexpr bool match_v64qi(__m512i _v, char __e00, char __e01, char __e02, char __e03, char __e04, char __e05, char __e06, char __e07,
-                                       char __e08, char __e09, char __e10, char __e11, char __e12, char __e13, char __e14, char __e15,
-                                       char __e16, char __e17, char __e18, char __e19, char __e20, char __e21, char __e22, char __e23,
-                                       char __e24, char __e25, char __e26, char __e27, char __e28, char __e29, char __e30, char __e31,
-                                       char __e32, char __e33, char __e34, char __e35, char __e36, char __e37, char __e38, char __e39,
-                                       char __e40, char __e41, char __e42, char __e43, char __e44, char __e45, char __e46, char __e47,
-                                       char __e48, char __e49, char __e50, char __e51, char __e52, char __e53, char __e54, char __e55,
-                                       char __e56, char __e57, char __e58, char __e59, char __e60, char __e61, char __e62, char __e63) {
-  __v64qi v = (__v64qi)_v;
+constexpr bool match_v64qi(__m512i _v, signed char __e00, signed char __e01, signed char __e02, signed char __e03, signed char __e04, signed char __e05, signed char __e06, signed char __e07,
+                                       signed char __e08, signed char __e09, signed char __e10, signed char __e11, signed char __e12, signed char __e13, signed char __e14, signed char __e15,
+                                       signed char __e16, signed char __e17, signed char __e18, signed char __e19, signed char __e20, signed char __e21, signed char __e22, signed char __e23,
+                                       signed char __e24, signed char __e25, signed char __e26, signed char __e27, signed char __e28, signed char __e29, signed char __e30, signed char __e31,
+                                       signed char __e32, signed char __e33, signed char __e34, signed char __e35, signed char __e36, signed char __e37, signed char __e38, signed char __e39,
+                                       signed char __e40, signed char __e41, signed char __e42, signed char __e43, signed char __e44, signed char __e45, signed char __e46, signed char __e47,
+                                       signed char __e48, signed char __e49, signed char __e50, signed char __e51, signed char __e52, signed char __e53, signed char __e54, signed char __e55,
+                                       signed char __e56, signed char __e57, signed char __e58, signed char __e59, signed char __e60, signed char __e61, signed char __e62, signed char __e63) {
+  __v64qs v = (__v64qs)_v;
   return v[ 0] == __e00 && v[ 1] == __e01 && v[ 2] == __e02 && v[ 3] == __e03 && v[ 4] == __e04 && v[ 5] == __e05 && v[ 6] == __e06 && v[ 7] == __e07 &&
          v[ 8] == __e08 && v[ 9] == __e09 && v[10] == __e10 && v[11] == __e11 && v[12] == __e12 && v[13] == __e13 && v[14] == __e14 && v[15] == __e15 &&
          v[16] == __e16 && v[17] == __e17 && v[18] == __e18 && v[19] == __e19 && v[20] == __e20 && v[21] == __e21 && v[22] == __e22 && v[23] == __e23 &&
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index 104bfea..12d9abd 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -31,7 +31,7 @@ __m128 test_mm_and_ps(__m128 A, __m128 B) {
   // CHECK: and <4 x i32>
   return _mm_and_ps(A, B);
 }
-TEST_CONSTEXPR(match_m128(_mm_and_ps((__m128){-4.0f, -5.0f, +6.0f, +7.0f}, (__m128){+0.0f, -0.0f, -0.0f, +7.0f}), -0.0f, -0.0f, +0.0f, +7.0f));
+TEST_CONSTEXPR(match_m128(_mm_and_ps((__m128){-4.0f, -5.0f, +6.0f, +7.0f}, (__m128){+0.0f, -0.0f, -0.0f, +7.0f}), +0.0f, -0.0f, +0.0f, +7.0f));
 
 __m128 test_mm_andnot_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_andnot_ps
@@ -39,7 +39,7 @@ __m128 test_mm_andnot_ps(__m128 A, __m128 B) {
   // CHECK: and <4 x i32>
   return _mm_andnot_ps(A, B);
 }
-TEST_CONSTEXPR(match_m128(_mm_andnot_ps((__m128){-4.0f, -5.0f, +6.0f, +7.0f}, (__m128){+0.0f, -0.0f, -0.0f, +7.0f}), +0.0f, +0.0f, +0.0f, +0.0f));
+TEST_CONSTEXPR(match_m128(_mm_andnot_ps((__m128){-4.0f, -5.0f, +6.0f, +7.0f}, (__m128){+0.0f, -0.0f, -0.0f, +7.0f}), +0.0f, +0.0f, -0.0f, +0.0f));
 
 __m128 test_mm_cmp_ps_eq_oq(__m128 a, __m128 b) {
   // CHECK-LABEL: test_mm_cmp_ps_eq_oq
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 612a619..49d8b39 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -108,7 +108,7 @@ __m128d test_mm_andnot_pd(__m128d A, __m128d B) {
   // CHECK: and <2 x i64>
   return _mm_andnot_pd(A, B);
 }
-TEST_CONSTEXPR(match_m128d(_mm_andnot_pd((__m128d){+1.0, -3.0}, (__m128d){+0.0, -0.0}), +0.0, -0.0));
+TEST_CONSTEXPR(match_m128d(_mm_andnot_pd((__m128d){+1.0, -3.0}, (__m128d){-0.0, +0.0}), -0.0, +0.0));
 
 __m128i test_mm_andnot_si128(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_andnot_si128
@@ -1780,24 +1780,28 @@ __m128i test_mm_unpackhi_epi8(__m128i A, __m128i B) {
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   return _mm_unpackhi_epi8(A, B);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_unpackhi_epi8((__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v16qi){16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31));
 
 __m128i test_mm_unpackhi_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_unpackhi_epi16
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   return _mm_unpackhi_epi16(A, B);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_unpackhi_epi16((__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v8hi){8, 9, 10, 11, 12, 13, 14, 15}), 4, 12, 5, 13, 6, 14, 7, 15));
 
 __m128i test_mm_unpackhi_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_unpackhi_epi32
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   return _mm_unpackhi_epi32(A, B);
 }
+TEST_CONSTEXPR(match_v4si(_mm_unpackhi_epi32((__m128i)(__v4si){0, 1, 2, 3}, (__m128i)(__v4si){ 4, 5, 6, 7}), 2, 6, 3, 7));
 
 __m128i test_mm_unpackhi_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_unpackhi_epi64
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3>
   return _mm_unpackhi_epi64(A, B);
 }
+TEST_CONSTEXPR(match_v2di(_mm_unpackhi_epi64((__m128i)(__v2di){0, 1}, (__m128i)(__v2di){2, 3}), 1, 3));
 
 __m128d test_mm_unpackhi_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_unpackhi_pd
@@ -1811,24 +1815,28 @@ __m128i test_mm_unpacklo_epi8(__m128i A, __m128i B) {
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   return _mm_unpacklo_epi8(A, B);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_unpacklo_epi8((__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v16qi){16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23));
 
 __m128i test_mm_unpacklo_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_unpacklo_epi16
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   return _mm_unpacklo_epi16(A, B);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_unpacklo_epi16((__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v8hi){8, 9, 10, 11, 12, 13, 14, 15}), 0, 8, 1, 9, 2, 10, 3, 11));
 
 __m128i test_mm_unpacklo_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_unpacklo_epi32
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   return _mm_unpacklo_epi32(A, B);
 }
+TEST_CONSTEXPR(match_v4si(_mm_unpacklo_epi32((__m128i)(__v4si){0, 1, 2, 3}, (__m128i)(__v4si){ 4, 5, 6, 7}), 0, 4, 1, 5));
 
 __m128i test_mm_unpacklo_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_unpacklo_epi64
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
   return _mm_unpacklo_epi64(A, B);
 }
+TEST_CONSTEXPR(match_v2di(_mm_unpacklo_epi64((__m128i)(__v2di){0, 1}, (__m128i)(__v2di){2, 3}), 0, 2));
 
 __m128d test_mm_unpacklo_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_unpacklo_pd
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 982c74c..ee44868 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -1,7 +1,11 @@
 // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 
 #include <immintrin.h>
diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c
index bb5d035..e37e535 100644
--- a/clang/test/CodeGen/builtins-elementwise-math.c
+++ b/clang/test/CodeGen/builtins-elementwise-math.c
@@ -66,7 +66,7 @@ void test_builtin_elementwise_abs(float f1, float f2, double d1, double d2,
   // CHECK-NEXT: call i32 @llvm.abs.i32(i32 [[IA1]], i1 false)
   b = __builtin_elementwise_abs(int_as_one);
 
-  // CHECK:   call i32 @llvm.abs.i32(i32 -10, i1 false)
+  // CHECK:   store i32 %elt.abs11, ptr @b, align 4
   b = __builtin_elementwise_abs(-10);
 
   // CHECK:      [[SI:%.+]] = load i16, ptr %si.addr, align 2
diff --git a/clang/test/CodeGen/c-strings.c b/clang/test/CodeGen/c-strings.c
index 12f7c42..31c438f 100644
--- a/clang/test/CodeGen/c-strings.c
+++ b/clang/test/CodeGen/c-strings.c
@@ -15,8 +15,8 @@
 // MSABI: @f4.x = internal global %struct.s { ptr @"??_C@_05CJBACGMB@hello?$AA@" }
 // CHECK: @x = {{(dso_local )?}}global [3 x i8] c"ola", align [[ALIGN]]
 
-// XFAIL: target=aarch64-pc-windows-{{.*}}, target=arm64ec-pc-windows-{{.*}}
-// Arm64 aligns arrays to either 32-bit or 64-bit boundaries, which fails
+// XFAIL: target=aarch64-{{.*}}-windows-msvc, target=arm64ec-{{.*}}-windows-msvc
+// Arm64 in MSVC mode aligns arrays to either 32-bit or 64-bit boundaries, which fails
 // various checks above, since ALIGN is derived from the alignment of a single
 // i8, which is still 1.
 
diff --git a/clang/test/CodeGen/ptrauth-qualifier-blocks.c b/clang/test/CodeGen/ptrauth-qualifier-blocks.c
index 62da59c..f460da2 100644
--- a/clang/test/CodeGen/ptrauth-qualifier-blocks.c
+++ b/clang/test/CodeGen/ptrauth-qualifier-blocks.c
@@ -82,9 +82,15 @@ void test_block_address_byref_capture() {
   // CHECK: store i32 33554432,
   // CHECK: store i32 48,
   // CHECK: [[COPY_HELPER_FIELD:%.*]] = getelementptr inbounds nuw [[BYREF_T]], ptr [[BYREF]], i32 0, i32 4
-  // CHECK: store ptr @__Block_byref_object_copy_, ptr [[COPY_HELPER_FIELD]], align
+  // CHECK: [[T0:%.*]] = ptrtoint ptr [[COPY_HELPER_FIELD]] to i64
+  // CHECK: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @__Block_byref_object_copy_ to i64), i32 0, i64 [[T0]])
+  // CHECK: [[T2:%.*]] = inttoptr i64 [[T1]] to ptr
+  // CHECK: store ptr [[T2]], ptr [[COPY_HELPER_FIELD]], align
   // CHECK: [[DISPOSE_HELPER_FIELD:%.*]] = getelementptr inbounds nuw [[BYREF_T]], ptr [[BYREF]], i32 0, i32 5
-  // CHECK: store ptr @__Block_byref_object_dispose_, ptr [[DISPOSE_HELPER_FIELD]], align
+  // CHECK: [[T0:%.*]] = ptrtoint ptr [[DISPOSE_HELPER_FIELD]] to i64
+  // CHECK: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @__Block_byref_object_dispose_ to i64), i32 0, i64 [[T0]])
+  // CHECK: [[T2:%.*]] = inttoptr i64 [[T1]] to ptr
+  // CHECK: store ptr [[T2]], ptr [[DISPOSE_HELPER_FIELD]], align
   //   flags - copy/dispose required
   // CHECK: store i32 1107296256, ptr
   __block struct A * __ptrauth(1, 1, 60) ptr = createA();
diff --git a/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m b/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
new file mode 100644
index 0000000..559cddf
--- /dev/null
+++ b/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -fobjc-arc -fblocks -fptrauth-calls -triple arm64e-apple-ios  -emit-llvm -o - %s | FileCheck %s
+
+_Static_assert(__has_feature(ptrauth_signed_block_descriptors), "-fptrauth-block-descriptor-pointers should set ptrauth_signed_block_descriptors");
+
+void a() {
+  // Test out a global block.
+  void (^blk)(void) = ^{};
+}
+
+// CHECK: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }
+
+
+// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr @_NSConcreteGlobalBlock, i32 1342177280, i32 0, ptr ptrauth (ptr @__a_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr ptrauth (ptr @"__block_descriptor_32_e5_v8\01?0l", i32 2, i64 49339, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 4)) }
+
+void b(int p) {
+  // CHECK-LABEL: define void @b
+
+  // Test out a stack block.
+  void (^blk)(void) = ^{(void)p;};
+
+  // CHECK: [[BLOCK:%.*]] = alloca <{ ptr, i32, i32, ptr, ptr, i32 }>
+  // CHECK: [[BLOCK_DESCRIPTOR_REF:%.*]] = getelementptr inbounds nuw <{ {{.*}} }>, ptr [[BLOCK]], i32 0, i32 4
+  // CHECK: [[BLOCK_DESCRIPTOR_REF_INT:%.*]] = ptrtoint ptr [[BLOCK_DESCRIPTOR_REF]] to i64
+  // CHECK: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[BLOCK_DESCRIPTOR_REF_INT]], i64 49339)
+  // CHECK: [[SIGNED_REF:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @"__block_descriptor_36_e5_v8\01?0l" to i64), i32 2, i64 [[BLENDED]])
+  // CHECK: [[SIGNED_REF_PTR:%.*]] = inttoptr i64 [[SIGNED_REF]] to ptr
+  // CHECK: store ptr [[SIGNED_REF_PTR]], ptr [[BLOCK_DESCRIPTOR_REF]]
+}
diff --git a/clang/test/CodeGenObjC/ptrauth-block-isa.m b/clang/test/CodeGenObjC/ptrauth-block-isa.m
index c1e98c6..c37fe8b 100644
--- a/clang/test/CodeGenObjC/ptrauth-block-isa.m
+++ b/clang/test/CodeGenObjC/ptrauth-block-isa.m
@@ -1,7 +1,8 @@
-// RUN: %clang_cc1 -fptrauth-calls -fptrauth-objc-isa -fobjc-arc -fblocks -triple arm64e -emit-llvm %s  -o - | FileCheck %s
+// RUN: %clang_cc1 -fptrauth-calls -fptrauth-objc-isa -fobjc-arc -fblocks -triple arm64e -emit-llvm %s -o - | FileCheck %s
 
 void (^globalblock)(void) = ^{};
-// CHECK: [[GLOBAL_BLOCK:@.*]] = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr [[GLOBAL_BLOCK]]), i32 1342177280, i32 0, ptr @globalblock_block_invoke, ptr @"__block_descriptor_32_e5_v8\01?0l" }, align 8 #0
+// CHECK: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }, comdat, align 8
+// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr @__block_literal_global), i32 1342177280, i32 0, ptr ptrauth (ptr @globalblock_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr ptrauth (ptr [[BLOCK_DESCRIPTOR_NAME]], i32 2, i64 49339, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 4)) }
 
 @interface A
 - (int) count;
diff --git a/clang/test/Driver/linker-wrapper-libs.c b/clang/test/Driver/linker-wrapper-libs.c
deleted file mode 100644
index 1404fe3..0000000
--- a/clang/test/Driver/linker-wrapper-libs.c
+++ /dev/null
@@ -1,191 +0,0 @@
-// REQUIRES: x86-registered-target
-// REQUIRES: nvptx-registered-target
-// REQUIRES: amdgpu-registered-target
-
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o
-
-#if defined(RESOLVES)
-int __attribute__((visibility("hidden"))) sym;
-#elif defined(GLOBAL)
-int __attribute__((visibility("protected"))) global;
-#elif defined(WEAK)
-int __attribute__((visibility("hidden"))) weak;
-#elif defined(HIDDEN)
-int __attribute__((visibility("hidden"))) hidden;
-#elif defined(UNDEFINED)
-extern int sym;
-int baz() { return sym; }
-#else
-extern int sym;
-
-extern int __attribute__((weak)) weak;
-
-int foo() { return sym; }
-int bar() { return weak; }
-#endif
-
-//
-// Check that we extract a static library defining an undefined symbol.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DRESOLVES -o %t.nvptx.resolves.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DRESOLVES -o %t.amdgpu.resolves.bc
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DUNDEFINED -o %t.nvptx.undefined.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DUNDEFINED -o %t.amdgpu.undefined.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.undefined.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.undefined.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \
-// RUN:   --image=file=%t.nvptx.resolves.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.resolves.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o %t.a -o a.out 2>&1 \
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-RESOLVES
-
-// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
-// LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-
-//
-// Check that we extract a static library that defines a global visibile to the
-// host.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DGLOBAL -o %t.nvptx.global.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DGLOBAL -o %t.amdgpu.global.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.global.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.global.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o %t.a -o a.out 2>&1 \
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL
-
-// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
-// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-
-//
-// Check that we do not extract a global symbol if the source file was not
-// created by an offloading language that expects there to be a host version of
-// the symbol.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DGLOBAL -o %t.nvptx.global.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DGLOBAL -o %t.amdgpu.global.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.global.bc,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.global.bc,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o %t.a -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-NONE
-
-// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-// LIBRARY-GLOBAL-NONE-NOT: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
-
-//
-// Check that we do not extract an external weak symbol.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DWEAK -o %t.nvptx.weak.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DWEAK -o %t.amdgpu.weak.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.weak.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.weak.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o %t.a -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-WEAK
-
-// LIBRARY-WEAK: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=nvptx64-nvidia-cuda -march=sm_70
-// LIBRARY-WEAK-NOT: {{.*}}.o {{.*}}.o
-// LIBRARY-WEAK: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=amdgcn-amd-amdhsa -mcpu=gfx1030
-
-//
-// Check that we do not extract an unneeded hidden symbol.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DHIDDEN -o %t.nvptx.hidden.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DHIDDEN -o %t.amdgpu.hidden.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.hidden.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.hidden.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o %t.a -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-HIDDEN
-
-// LIBRARY-HIDDEN: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=nvptx64-nvidia-cuda -march=sm_70
-// LIBRARY-HIDDEN-NOT: {{.*}}.o {{.*}}.o
-// LIBRARY-HIDDEN: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=amdgcn-amd-amdhsa -mcpu=gfx1030
-
-//
-// Check that we do not extract a static library that defines a global visibile
-// to the host that is already defined.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DGLOBAL -o %t.nvptx.global.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DGLOBAL -o %t.amdgpu.global.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.global.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.global.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o %t.a %t.a -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-DEFINED
-
-// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
-// LIBRARY-GLOBAL-DEFINED-NOT: {{.*}}gfx1030{{.*}}.o
-// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-
-//
-// Check that we can use --[no-]whole-archive to control extraction.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DGLOBAL -o %t.nvptx.global.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DGLOBAL -o %t.amdgpu.global.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.global.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.nvptx.global.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_52 \
-// RUN:   --image=file=%t.amdgpu.global.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \
-// RUN:   --image=file=%t.amdgpu.global.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o --whole-archive %t.a -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-WHOLE-ARCHIVE
-
-// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
-// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=nvptx64-nvidia-cuda -march=sm_52 {{.*}}.o
-// LIBRARY-WHOLE-ARCHIVE: clang{{.*}} -o {{.*}}.img -dumpdir {{.*}}.img. --target=amdgcn-amd-amdhsa -mcpu=gfx90a {{.*}}.o
diff --git a/clang/test/Frontend/skip-function-bodies.cpp b/clang/test/Frontend/skip-function-bodies.cpp
index d0593b4..4cfc4c5 100644
--- a/clang/test/Frontend/skip-function-bodies.cpp
+++ b/clang/test/Frontend/skip-function-bodies.cpp
@@ -1,13 +1,15 @@
 // Trivial check to ensure skip-function-bodies flag is propagated.
 //
-// RUN: %clang_cc1 -verify -skip-function-bodies -pedantic-errors %s
-// expected-no-diagnostics
+// RUN: %clang_cc1 -verify -skip-function-bodies %s
 
 int f() {
   // normally this should emit some diags, but we're skipping it!
   this is garbage;
 }
 
+void g() __attribute__((__diagnose_if__(baz))) {}
+// expected-error@-1 {{use of undeclared identifier 'baz'}}
+
 // Make sure we only accept it as a cc1 arg.
 // RUN: not %clang -skip-function-bodies %s 2>&1 | FileCheck %s
 // CHECK: clang: error: unknown argument '-skip-function-bodies'; did you mean '-Xclang -skip-function-bodies'?
diff --git a/clang/test/Modules/befriend-2.cppm b/clang/test/Modules/befriend-2.cppm
new file mode 100644
index 0000000..9d0baf8
--- /dev/null
+++ b/clang/test/Modules/befriend-2.cppm
@@ -0,0 +1,65 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/test-A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/N.cppm -emit-reduced-module-interface -o %t/test-N.pcm
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -verify -fsyntax-only -fprebuilt-module-path=%t
+
+//--- a.h
+namespace N {
+
+    template <typename>
+    class C {
+    template <typename> friend void foo();
+    };
+
+    template <typename> void foo() {}
+} // namespace N
+
+//--- a.cppm
+// This is some unrelated file. It also #includes system headers, and
+// here it re-exports N::C and N::foo through using-declarations.
+module;
+#include "a.h"
+export module test:A;
+export {
+    using N::C;
+    using N::foo;
+}
+
+//--- std.h
+// Declarations typically #included from C++ header files:
+namespace N {               // In practice, this would be namespace std
+    inline namespace impl {   // In practice, this would be namespace __1
+        template <typename>
+        class C {
+        template <typename> friend void foo();
+        };
+    
+        template <typename> void foo() {}
+    } // namespace impl
+    } // namespace N
+
+//--- N.cppm
+module;
+#include "std.h"
+export module test:N;
+
+// Now wrap these names into a module and export them:
+export {
+    namespace N   {
+        using N::C;
+        using N::foo;
+    }
+}
+
+//--- B.cppm
+// expected-no-diagnostics
+// A file that consumes the partitions from the other two files,
+// including the exported N::C name.
+module test:B;
+import :N;
+import :A;
+
+N::C<int> x;
diff --git a/clang/test/Modules/befriend-3.cppm b/clang/test/Modules/befriend-3.cppm
new file mode 100644
index 0000000..f8dbc423
--- /dev/null
+++ b/clang/test/Modules/befriend-3.cppm
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -std=c++20 %s -fsyntax-only -verify
+export module m;
+
+namespace test {
+namespace ns1 {
+    namespace ns2 {
+    template<class T> void f(T t); // expected-note {{target of using declaration}}
+    }
+    using ns2::f; // expected-note {{using declaration}}
+}
+struct A { void f(); }; // expected-note 2{{target of using declaration}}
+struct B : public A { using A::f; }; // expected-note {{using declaration}}
+template<typename T> struct C : A { using A::f; }; // expected-note {{using declaration}}
+struct X {
+    template<class T> friend void ns1::f(T t); // expected-error {{cannot befriend target of using declaration}}
+    friend void B::f(); // expected-error {{cannot befriend target of using declaration}}
+    friend void C<int>::f(); // expected-error {{cannot befriend target of using declaration}}
+};
+}
diff --git a/clang/test/Modules/pr138558.cppm b/clang/test/Modules/pr138558.cppm
new file mode 100644
index 0000000..c637ce2
--- /dev/null
+++ b/clang/test/Modules/pr138558.cppm
@@ -0,0 +1,54 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/test-A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/N.cppm -emit-reduced-module-interface -o %t/test-N.pcm
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -verify -fsyntax-only -fprebuilt-module-path=%t
+
+//--- a.h
+namespace N {
+inline namespace impl   {
+    template <typename>
+    class C {
+    template <typename> friend void foo();
+    };
+
+    template <typename> void foo() {}
+} // namespace impl
+} // namespace N
+
+//--- a.cppm
+// This is some unrelated file. It also #includes system headers, but
+// here does not even export anything.
+module;
+#include "a.h"
+export module test:A;
+// To make sure they won't be elided.
+using N::C;
+using N::foo;
+
+//--- N.cppm
+module;
+#include "a.h"
+export module test:N;
+
+// Now wrap these names into a module and export them:
+export {
+  namespace N   {
+    inline namespace impl    {
+      using N::impl::C;
+      using N::impl::foo;
+    }
+  }
+}
+
+//--- B.cppm
+// expected-no-diagnostics
+// A file that consumes the partitions from the other two files,
+// including the exported N::C name.
+module test:B;
+import :N;
+import :A;
+
+N::C<int> x;
diff --git a/clang/test/Parser/cxx-variadic-func.cpp b/clang/test/Parser/cxx-variadic-func.cpp
index 98a34d3..73124b8 100644
--- a/clang/test/Parser/cxx-variadic-func.cpp
+++ b/clang/test/Parser/cxx-variadic-func.cpp
@@ -6,3 +6,24 @@ void f(...) {
 }
 
 void h(int n..., int m); // expected-error {{expected ')'}} expected-note {{to match}}
+
+
+namespace GH153445 {
+void f(int = {}...);
+
+struct S {
+  void f(int = {}...);
+  void g(int...);
+};
+
+void S::g(int = {}...) {}
+}
+
+
+template <typename ...T>
+constexpr int a() {return 1;}
+
+struct S2 {
+  template <typename ...Ts>
+  void f(int = a<Ts...>()...);
+};
diff --git a/clang/test/Parser/cxx2c-oxford-variadic-comma.cpp b/clang/test/Parser/cxx2c-oxford-variadic-comma.cpp
index b8015b4..18ce770 100644
--- a/clang/test/Parser/cxx2c-oxford-variadic-comma.cpp
+++ b/clang/test/Parser/cxx2c-oxford-variadic-comma.cpp
@@ -36,6 +36,7 @@ void o(int x, ...);
 
 struct S {
   void p(this S...) {} // expected-warning {{declaration of a variadic function without a comma before '...' is deprecated}}
+  void f(int = {}...); // expected-warning {{declaration of a variadic function without a comma before '...' is deprecated}}
 };
 
 template<class ...Ts>
diff --git a/clang/test/Parser/diagnose_if.cpp b/clang/test/Parser/diagnose_if.cpp
new file mode 100644
index 0000000..5205980
--- /dev/null
+++ b/clang/test/Parser/diagnose_if.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 %s -fsyntax-only -fcxx-exceptions -verify
+
+void t1() __attribute__((__diagnose_if__(baz))) try {} catch(...) {}
+// expected-error@-1 {{use of undeclared identifier 'baz'}}
+
+struct A {
+  A();
+};
+
+A::A() __attribute__((__diagnose_if__(baz))) :;
+// expected-error@-1 {{expected class member or base class name}}
+// expected-error@-2 {{use of undeclared identifier 'baz'}}
diff --git a/clang/test/Parser/explicit-bool-pre-cxx17.cpp b/clang/test/Parser/explicit-bool-pre-cxx17.cpp
new file mode 100644
index 0000000..fee0889
--- /dev/null
+++ b/clang/test/Parser/explicit-bool-pre-cxx17.cpp
@@ -0,0 +1,15 @@
+// Regression test for an assertion failure when explicit(bool) is used in
+// pre-C++20 language modes (GitHub issue #152729).
+// RUN: %clang_cc1 -std=c++98 -verify %s
+// RUN: %clang_cc1 -std=c++03 -verify %s
+// RUN: %clang_cc1 -std=c++11 -verify %s
+// RUN: %clang_cc1 -std=c++14 -verify %s
+// RUN: %clang_cc1 -std=c++17 -verify %s
+
+struct S {
+  explicit(true) S(int);
+  // expected-warning@-1 {{explicit(bool) is a C++20 extension}}
+  
+  explicit(false) S(float);
+  // expected-warning@-1 {{explicit(bool) is a C++20 extension}}
+};
diff --git a/clang/test/Sema/address-packed.c b/clang/test/Sema/address-packed.c
index 29f1249..f826b7d 100644
--- a/clang/test/Sema/address-packed.c
+++ b/clang/test/Sema/address-packed.c
@@ -338,3 +338,11 @@ struct Invalid0 {
 void *g14(struct Invalid0 *ivl) {
   return &(ivl->x);
 }
+
+void to_void_with_expr(void *ptr, int expr);
+
+void g15(void) {
+  struct Arguable arguable;
+  to_void_with_expr(&arguable.x, 3); // no-warning
+  to_void_with_expr(&arguable.x, ({3;})); // no-warning
+}
diff --git a/clang/test/Sema/constant-builtins-vector.cpp b/clang/test/Sema/constant-builtins-vector.cpp
index bc575dc..2b7d76e 100644
--- a/clang/test/Sema/constant-builtins-vector.cpp
+++ b/clang/test/Sema/constant-builtins-vector.cpp
@@ -876,3 +876,21 @@ static_assert(__builtin_elementwise_min(~0U, 0U) == 0U);
 static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_min((vector4char){1, -2, 3, -4}, (vector4char){4, -3, 2, -1})) == (LITTLE_END ? 0xFC02FD01 : 0x01FD02FC));
 static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_min((vector4uchar){1, 2, 3, 4}, (vector4uchar){4, 3, 2, 1})) == 0x01020201U);
 static_assert(__builtin_bit_cast(unsigned long long, __builtin_elementwise_min((vector4short){1, -2, 3, -4}, (vector4short){4, -3, 2, -1})) == (LITTLE_END ? 0xFFFC0002FFFD0001 : 0x0001FFFD0002FFFC));
+
+static_assert(__builtin_elementwise_abs(10) == 10);
+static_assert(__builtin_elementwise_abs(-10) == 10);
+static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_abs((vector4char){-1, -2, -3, 4})) == (LITTLE_END ? 0x04030201 : 0x01020304));
+static_assert(__builtin_elementwise_abs((int)(-2147483648)) == (int)(-2147483648)); // the absolute value of the most negative integer remains the most negative integer
+
+// check floating point for elementwise abs
+#define CHECK_FOUR_FLOAT_VEC(vec1, vec2) \
+    static_assert(__builtin_fabs(vec1[0] - vec2[0]) < 1e-6); \
+    static_assert(__builtin_fabs(vec1[1] - vec2[1]) < 1e-6); \
+    static_assert(__builtin_fabs(vec1[2] - vec2[2]) < 1e-6); \
+    static_assert(__builtin_fabs(vec1[3] - vec2[3]) < 1e-6);
+
+// checking floating point vector
+CHECK_FOUR_FLOAT_VEC(__builtin_elementwise_abs((vector4float){-1.123, 2.123, -3.123, 4.123}), ((vector4float){1.123, 2.123, 3.123, 4.123}))
+CHECK_FOUR_FLOAT_VEC(__builtin_elementwise_abs((vector4double){-1.123, 2.123, -3.123, 4.123}), ((vector4double){1.123, 2.123, 3.123, 4.123}))
+static_assert(__builtin_elementwise_abs((float)-1.123) - (float)1.123 < 1e-6); // making sure one element works
+#undef CHECK_FOUR_FLOAT_VEC
diff --git a/clang/test/Sema/format-strings-signedness.c b/clang/test/Sema/format-strings-signedness.c
index d5a8140..773ff41 100644
--- a/clang/test/Sema/format-strings-signedness.c
+++ b/clang/test/Sema/format-strings-signedness.c
@@ -39,13 +39,13 @@ void test_printf_unsigned_char(unsigned char x)
 void test_printf_int(int x)
 {
     printf("%d", x); // no-warning
-    printf("%u", x); // expected-warning{{format specifies type 'unsigned int' but the argument has type 'int'}}
-    printf("%x", x); // expected-warning{{format specifies type 'unsigned int' but the argument has type 'int'}}
+    printf("%u", x); // expected-warning{{format specifies type 'unsigned int' but the argument has type 'int', which differs in signedness}}
+    printf("%x", x); // expected-warning{{format specifies type 'unsigned int' but the argument has type 'int', which differs in signedness}}
 }
 
 void test_printf_unsigned(unsigned x)
 {
-    printf("%d", x); // expected-warning{{format specifies type 'int' but the argument has type 'unsigned int'}}
+    printf("%d", x); // expected-warning{{format specifies type 'int' but the argument has type 'unsigned int', which differs in signedness}}
     printf("%u", x); // no-warning
     printf("%x", x); // no-warning
 }
@@ -53,13 +53,13 @@ void test_printf_unsigned(unsigned x)
 void test_printf_long(long x)
 {
     printf("%ld", x); // no-warning
-    printf("%lu", x); // expected-warning{{format specifies type 'unsigned long' but the argument has type 'long'}}
-    printf("%lx", x); // expected-warning{{format specifies type 'unsigned long' but the argument has type 'long'}}
+    printf("%lu", x); // expected-warning{{format specifies type 'unsigned long' but the argument has type 'long', which differs in signedness}}
+    printf("%lx", x); // expected-warning{{format specifies type 'unsigned long' but the argument has type 'long', which differs in signedness}}
 }
 
 void test_printf_unsigned_long(unsigned long x)
 {
-    printf("%ld", x); // expected-warning{{format specifies type 'long' but the argument has type 'unsigned long'}}
+    printf("%ld", x); // expected-warning{{format specifies type 'long' but the argument has type 'unsigned long', which differs in signedness}}
     printf("%lu", x); // no-warning
     printf("%lx", x); // no-warning
 }
@@ -67,13 +67,13 @@ void test_printf_unsigned_long(unsigned long x)
 void test_printf_long_long(long long x)
 {
     printf("%lld", x); // no-warning
-    printf("%llu", x); // expected-warning{{format specifies type 'unsigned long long' but the argument has type 'long long'}}
-    printf("%llx", x); // expected-warning{{format specifies type 'unsigned long long' but the argument has type 'long long'}}
+    printf("%llu", x); // expected-warning{{format specifies type 'unsigned long long' but the argument has type 'long long', which differs in signedness}}
+    printf("%llx", x); // expected-warning{{format specifies type 'unsigned long long' but the argument has type 'long long', which differs in signedness}}
 }
 
 void test_printf_unsigned_long_long(unsigned long long x)
 {
-    printf("%lld", x); // expected-warning{{format specifies type 'long long' but the argument has type 'unsigned long long'}}
+    printf("%lld", x); // expected-warning{{format specifies type 'long long' but the argument has type 'unsigned long long', which differs in signedness}}
     printf("%llu", x); // no-warning
     printf("%llx", x); // no-warning
 }
@@ -85,8 +85,8 @@ enum enum_int {
 void test_printf_enum_int(enum enum_int x)
 {
     printf("%d", x); // no-warning
-    printf("%u", x); // expected-warning{{format specifies type 'unsigned int' but the argument has underlying type 'int'}}
-    printf("%x", x); // expected-warning{{format specifies type 'unsigned int' but the argument has underlying type 'int'}}
+    printf("%u", x); // expected-warning{{format specifies type 'unsigned int' but the argument has underlying type 'int', which differs in signedness}}
+    printf("%x", x); // expected-warning{{format specifies type 'unsigned int' but the argument has underlying type 'int', which differs in signedness}}
 }
 
 #ifndef _WIN32 // Disabled due to enums have different underlying type on _WIN32
@@ -96,7 +96,7 @@ enum enum_unsigned {
 
 void test_printf_enum_unsigned(enum enum_unsigned x)
 {
-    printf("%d", x); // expected-warning{{format specifies type 'int' but the argument has underlying type 'unsigned int'}}
+    printf("%d", x); // expected-warning{{format specifies type 'int' but the argument has underlying type 'unsigned int', which differs in signedness}}
     printf("%u", x); // no-warning
     printf("%x", x); // no-warning
 }
@@ -110,8 +110,8 @@ enum enum_long {
 void test_printf_enum_long(enum enum_long x)
 {
     printf("%ld", x); // no-warning
-    printf("%lu", x); // expected-warning{{format specifies type 'unsigned long' but the argument has underlying type 'long'}}
-    printf("%lx", x); // expected-warning{{format specifies type 'unsigned long' but the argument has underlying type 'long'}}
+    printf("%lu", x); // expected-warning{{format specifies type 'unsigned long' but the argument has underlying type 'long', which differs in signedness}}
+    printf("%lx", x); // expected-warning{{format specifies type 'unsigned long' but the argument has underlying type 'long', which differs in signedness}}
 }
 
 enum enum_unsigned_long {
@@ -120,7 +120,7 @@ enum enum_unsigned_long {
 
 void test_printf_enum_unsigned_long(enum enum_unsigned_long x)
 {
-    printf("%ld", x); // expected-warning{{format specifies type 'long' but the argument has underlying type 'unsigned long'}}
+    printf("%ld", x); // expected-warning{{format specifies type 'long' but the argument has underlying type 'unsigned long', which differs in signedness}}
     printf("%lu", x); // no-warning
     printf("%lx", x); // no-warning
 }
@@ -136,61 +136,61 @@ void test_scanf_unsigned_char(unsigned char *y) {
 
 void test_scanf_int(int *x) {
   scanf("%d", x); // no-warning
-  scanf("%u", x); // expected-warning{{format specifies type 'unsigned int *' but the argument has type 'int *'}}
-  scanf("%x", x); // expected-warning{{format specifies type 'unsigned int *' but the argument has type 'int *'}}
+  scanf("%u", x); // expected-warning{{format specifies type 'unsigned int *' but the argument has type 'int *', which differs in signedness}}
+  scanf("%x", x); // expected-warning{{format specifies type 'unsigned int *' but the argument has type 'int *', which differs in signedness}}
 }
 
 void test_scanf_unsigned(unsigned *x) {
-  scanf("%d", x); // expected-warning{{format specifies type 'int *' but the argument has type 'unsigned int *'}}
+  scanf("%d", x); // expected-warning{{format specifies type 'int *' but the argument has type 'unsigned int *', which differs in signedness}}
   scanf("%u", x); // no-warning
   scanf("%x", x); // no-warning
 }
 
 void test_scanf_long(long *x) {
   scanf("%ld", x); // no-warning
-  scanf("%lu", x); // expected-warning{{format specifies type 'unsigned long *' but the argument has type 'long *'}}
-  scanf("%lx", x); // expected-warning{{format specifies type 'unsigned long *' but the argument has type 'long *'}}
+  scanf("%lu", x); // expected-warning{{format specifies type 'unsigned long *' but the argument has type 'long *', which differs in signedness}}
+  scanf("%lx", x); // expected-warning{{format specifies type 'unsigned long *' but the argument has type 'long *', which differs in signedness}}
 }
 
 void test_scanf_unsigned_long(unsigned long *x) {
-  scanf("%ld", x); // expected-warning{{format specifies type 'long *' but the argument has type 'unsigned long *'}}
+  scanf("%ld", x); // expected-warning{{format specifies type 'long *' but the argument has type 'unsigned long *', which differs in signedness}}
   scanf("%lu", x); // no-warning
   scanf("%lx", x); // no-warning
 }
 
 void test_scanf_longlong(long long *x) {
   scanf("%lld", x); // no-warning
-  scanf("%llu", x); // expected-warning{{format specifies type 'unsigned long long *' but the argument has type 'long long *'}}
-  scanf("%llx", x); // expected-warning{{format specifies type 'unsigned long long *' but the argument has type 'long long *'}}
+  scanf("%llu", x); // expected-warning{{format specifies type 'unsigned long long *' but the argument has type 'long long *', which differs in signedness}}
+  scanf("%llx", x); // expected-warning{{format specifies type 'unsigned long long *' but the argument has type 'long long *', which differs in signedness}}
 }
 
 void test_scanf_unsigned_longlong(unsigned long long *x) {
-  scanf("%lld", x); // expected-warning{{format specifies type 'long long *' but the argument has type 'unsigned long long *'}}
+  scanf("%lld", x); // expected-warning{{format specifies type 'long long *' but the argument has type 'unsigned long long *', which differs in signedness}}
   scanf("%llu", x); // no-warning
   scanf("%llx", x); // no-warning
 }
 
 void test_scanf_enum_int(enum enum_int *x) {
   scanf("%d", x); // no-warning
-  scanf("%u", x); // expected-warning{{format specifies type 'unsigned int *' but the argument has type 'enum enum_int *'}}
-  scanf("%x", x); // expected-warning{{format specifies type 'unsigned int *' but the argument has type 'enum enum_int *'}}
+  scanf("%u", x); // expected-warning{{format specifies type 'unsigned int *' but the argument has type 'enum enum_int *', which differs in signedness}}
+  scanf("%x", x); // expected-warning{{format specifies type 'unsigned int *' but the argument has type 'enum enum_int *', which differs in signedness}}
 }
 
 #ifndef _WIN32 // Disabled due to enums have different underlying type on _WIN32
 void test_scanf_enum_unsigned(enum enum_unsigned *x) {
-  scanf("%d", x); // expected-warning{{format specifies type 'int *' but the argument has type 'enum enum_unsigned *'}}
+  scanf("%d", x); // expected-warning{{format specifies type 'int *' but the argument has type 'enum enum_unsigned *', which differs in signedness}}
   scanf("%u", x); // no-warning
   scanf("%x", x); // no-warning
 }
 
 void test_scanf_enum_long(enum enum_long *x) {
   scanf("%ld", x); // no-warning
-  scanf("%lu", x); // expected-warning{{format specifies type 'unsigned long *' but the argument has type 'enum enum_long *'}}
-  scanf("%lx", x); // expected-warning{{format specifies type 'unsigned long *' but the argument has type 'enum enum_long *'}}
+  scanf("%lu", x); // expected-warning{{format specifies type 'unsigned long *' but the argument has type 'enum enum_long *', which differs in signedness}}
+  scanf("%lx", x); // expected-warning{{format specifies type 'unsigned long *' but the argument has type 'enum enum_long *', which differs in signedness}}
 }
 
 void test_scanf_enum_unsigned_long(enum enum_unsigned_long *x) {
-  scanf("%ld", x); // expected-warning{{format specifies type 'long *' but the argument has type 'enum enum_unsigned_long *'}}
+  scanf("%ld", x); // expected-warning{{format specifies type 'long *' but the argument has type 'enum enum_unsigned_long *', which differs in signedness}}
   scanf("%lu", x); // no-warning
   scanf("%lx", x); // no-warning
 }
diff --git a/clang/test/SemaOpenACC/init-construct.cpp b/clang/test/SemaOpenACC/init-construct.cpp
index abc7f74..d553589 100644
--- a/clang/test/SemaOpenACC/init-construct.cpp
+++ b/clang/test/SemaOpenACC/init-construct.cpp
@@ -34,6 +34,12 @@ void uses() {
   // expected-error@+2{{OpenACC integer expression requires explicit conversion from 'struct ExplicitConvertOnly' to 'int'}}
   // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
 #pragma acc init device_num(Explicit)
+
+  // expected-error@+1{{OpenACC 'device_type' clause on a 'init' construct only permits one architecture}}
+#pragma acc init device_type(nvidia, radeon)
+
+  // expected-error@+1{{OpenACC 'device_type' clause on a 'init' construct only permits one architecture}}
+#pragma acc init  device_type(nonsense, nvidia, radeon)
 }
 
 template<typename T>
diff --git a/clang/test/SemaOpenACC/shutdown-construct.cpp b/clang/test/SemaOpenACC/shutdown-construct.cpp
index 95cea90..e08a968 100644
--- a/clang/test/SemaOpenACC/shutdown-construct.cpp
+++ b/clang/test/SemaOpenACC/shutdown-construct.cpp
@@ -34,6 +34,12 @@ void uses() {
   // expected-error@+2{{OpenACC integer expression requires explicit conversion from 'struct ExplicitConvertOnly' to 'int'}}
   // expected-note@#EXPL_CONV{{conversion to integral type 'int'}}
 #pragma acc shutdown device_num(Explicit)
+
+  // expected-error@+1{{OpenACC 'device_type' clause on a 'shutdown' construct only permits one architecture}}
+#pragma acc shutdown device_type(nvidia, radeon)
+
+  // expected-error@+1{{OpenACC 'device_type' clause on a 'shutdown' construct only permits one architecture}}
+#pragma acc shutdown device_type(nonsense, nvidia, radeon)
 }
 
 template<typename T>
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 1d91f5f2..a56e758 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -911,9 +911,9 @@ Error handleOverrideImages(
 
 /// Transforms all the extracted offloading input files into an image that can
 /// be registered by the runtime.
-Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
-    SmallVectorImpl<SmallVector<OffloadFile>> &LinkerInputFiles,
-    const InputArgList &Args, char **Argv, int Argc) {
+Expected<SmallVector<StringRef>>
+linkAndWrapDeviceFiles(ArrayRef<SmallVector<OffloadFile>> LinkerInputFiles,
+                       const InputArgList &Args, char **Argv, int Argc) {
   llvm::TimeTraceScope TimeScope("Handle all device input");
 
   std::mutex ImageMtx;
@@ -1069,147 +1069,6 @@ std::optional<std::string> searchLibrary(StringRef Input, StringRef Root,
   return searchLibraryBaseName(Input, Root, SearchPaths);
 }
 
-/// Common redeclaration of needed symbol flags.
-enum Symbol : uint32_t {
-  Sym_None = 0,
-  Sym_Undefined = 1U << 1,
-  Sym_Weak = 1U << 2,
-};
-
-/// Scan the symbols from a BitcodeFile \p Buffer and record if we need to
-/// extract any symbols from it.
-Expected<bool> getSymbolsFromBitcode(MemoryBufferRef Buffer, OffloadKind Kind,
-                                     bool IsArchive, StringSaver &Saver,
-                                     DenseMap<StringRef, Symbol> &Syms) {
-  Expected<IRSymtabFile> IRSymtabOrErr = readIRSymtab(Buffer);
-  if (!IRSymtabOrErr)
-    return IRSymtabOrErr.takeError();
-
-  bool ShouldExtract = !IsArchive;
-  DenseMap<StringRef, Symbol> TmpSyms;
-  for (unsigned I = 0; I != IRSymtabOrErr->Mods.size(); ++I) {
-    for (const auto &Sym : IRSymtabOrErr->TheReader.module_symbols(I)) {
-      if (Sym.isFormatSpecific() || !Sym.isGlobal())
-        continue;
-
-      auto It = Syms.find(Sym.getName());
-      bool NewSymbol = It == Syms.end();
-      auto OldSym = NewSymbol ? Sym_None : It->second;
-
-      // We will extract if it defines a currenlty undefined non-weak
-      // symbol.
-      bool ResolvesStrongReference =
-          ((OldSym & Sym_Undefined && !(OldSym & Sym_Weak)) &&
-           !Sym.isUndefined());
-      // We will extract if it defines a new global symbol visible to the
-      // host. This is only necessary for code targeting an offloading
-      // language.
-      bool NewGlobalSymbol =
-          ((NewSymbol || (OldSym & Sym_Undefined)) && !Sym.isUndefined() &&
-           !Sym.canBeOmittedFromSymbolTable() && Kind != object::OFK_None &&
-           (Sym.getVisibility() != GlobalValue::HiddenVisibility));
-      ShouldExtract |= ResolvesStrongReference | NewGlobalSymbol;
-
-      // Update this symbol in the "table" with the new information.
-      if (OldSym & Sym_Undefined && !Sym.isUndefined())
-        TmpSyms[Saver.save(Sym.getName())] =
-            static_cast<Symbol>(OldSym & ~Sym_Undefined);
-      if (Sym.isUndefined() && NewSymbol)
-        TmpSyms[Saver.save(Sym.getName())] =
-            static_cast<Symbol>(OldSym | Sym_Undefined);
-      if (Sym.isWeak())
-        TmpSyms[Saver.save(Sym.getName())] =
-            static_cast<Symbol>(OldSym | Sym_Weak);
-    }
-  }
-
-  // If the file gets extracted we update the table with the new symbols.
-  if (ShouldExtract)
-    Syms.insert_range(TmpSyms);
-
-  return ShouldExtract;
-}
-
-/// Scan the symbols from an ObjectFile \p Obj and record if we need to extract
-/// any symbols from it.
-Expected<bool> getSymbolsFromObject(const ObjectFile &Obj, OffloadKind Kind,
-                                    bool IsArchive, StringSaver &Saver,
-                                    DenseMap<StringRef, Symbol> &Syms) {
-  bool ShouldExtract = !IsArchive;
-  DenseMap<StringRef, Symbol> TmpSyms;
-  for (SymbolRef Sym : Obj.symbols()) {
-    auto FlagsOrErr = Sym.getFlags();
-    if (!FlagsOrErr)
-      return FlagsOrErr.takeError();
-
-    if (!(*FlagsOrErr & SymbolRef::SF_Global) ||
-        (*FlagsOrErr & SymbolRef::SF_FormatSpecific))
-      continue;
-
-    auto NameOrErr = Sym.getName();
-    if (!NameOrErr)
-      return NameOrErr.takeError();
-
-    bool NewSymbol = Syms.count(*NameOrErr) == 0;
-    auto OldSym = NewSymbol ? Sym_None : Syms[*NameOrErr];
-
-    // We will extract if it defines a currenlty undefined non-weak symbol.
-    bool ResolvesStrongReference = (OldSym & Sym_Undefined) &&
-                                   !(OldSym & Sym_Weak) &&
-                                   !(*FlagsOrErr & SymbolRef::SF_Undefined);
-
-    // We will extract if it defines a new global symbol visible to the
-    // host. This is only necessary for code targeting an offloading
-    // language.
-    bool NewGlobalSymbol =
-        ((NewSymbol || (OldSym & Sym_Undefined)) &&
-         !(*FlagsOrErr & SymbolRef::SF_Undefined) && Kind != object::OFK_None &&
-         !(*FlagsOrErr & SymbolRef::SF_Hidden));
-    ShouldExtract |= ResolvesStrongReference | NewGlobalSymbol;
-
-    // Update this symbol in the "table" with the new information.
-    if (OldSym & Sym_Undefined && !(*FlagsOrErr & SymbolRef::SF_Undefined))
-      TmpSyms[Saver.save(*NameOrErr)] =
-          static_cast<Symbol>(OldSym & ~Sym_Undefined);
-    if (*FlagsOrErr & SymbolRef::SF_Undefined && NewSymbol)
-      TmpSyms[Saver.save(*NameOrErr)] =
-          static_cast<Symbol>(OldSym | Sym_Undefined);
-    if (*FlagsOrErr & SymbolRef::SF_Weak)
-      TmpSyms[Saver.save(*NameOrErr)] = static_cast<Symbol>(OldSym | Sym_Weak);
-  }
-
-  // If the file gets extracted we update the table with the new symbols.
-  if (ShouldExtract)
-    Syms.insert_range(TmpSyms);
-
-  return ShouldExtract;
-}
-
-/// Attempt to 'resolve' symbols found in input files. We use this to
-/// determine if an archive member needs to be extracted. An archive member
-/// will be extracted if any of the following is true.
-///   1) It defines an undefined symbol in a regular object filie.
-///   2) It defines a global symbol without hidden visibility that has not
-///      yet been defined.
-Expected<bool> getSymbols(StringRef Image, OffloadKind Kind, bool IsArchive,
-                          StringSaver &Saver,
-                          DenseMap<StringRef, Symbol> &Syms) {
-  MemoryBufferRef Buffer = MemoryBufferRef(Image, "");
-  switch (identify_magic(Image)) {
-  case file_magic::bitcode:
-    return getSymbolsFromBitcode(Buffer, Kind, IsArchive, Saver, Syms);
-  case file_magic::elf_relocatable: {
-    Expected<std::unique_ptr<ObjectFile>> ObjFile =
-        ObjectFile::createObjectFile(Buffer);
-    if (!ObjFile)
-      return ObjFile.takeError();
-    return getSymbolsFromObject(**ObjFile, Kind, IsArchive, Saver, Syms);
-  }
-  default:
-    return false;
-  }
-}
-
 /// Search the input files and libraries for embedded device offloading code
 /// and add it to the list of files to be linked. Files coming from static
 /// libraries are only added to the input if they are used by an existing
@@ -1279,7 +1138,6 @@ getDeviceInput(const ArgList &Args) {
 
   // Link all standard input files and update the list of symbols.
   MapVector<OffloadFile::TargetID, SmallVector<OffloadFile, 0>> InputFiles;
-  DenseMap<OffloadFile::TargetID, DenseMap<StringRef, Symbol>> Syms;
   for (OffloadFile &Binary : ObjectFilesToExtract) {
     if (!Binary.getBinary())
       continue;
@@ -1290,12 +1148,6 @@ getDeviceInput(const ArgList &Args) {
         CompatibleTargets.emplace_back(ID);
 
     for (const auto &[Index, ID] : llvm::enumerate(CompatibleTargets)) {
-      Expected<bool> ExtractOrErr = getSymbols(
-          Binary.getBinary()->getImage(), Binary.getBinary()->getOffloadKind(),
-          /*IsArchive=*/false, Saver, Syms[ID]);
-      if (!ExtractOrErr)
-        return ExtractOrErr.takeError();
-
       // If another target needs this binary it must be copied instead.
       if (Index == CompatibleTargets.size() - 1)
         InputFiles[ID].emplace_back(std::move(Binary));
@@ -1304,55 +1156,33 @@ getDeviceInput(const ArgList &Args) {
     }
   }
 
-  // Archive members only extract if they define needed symbols. We do this
-  // after every regular input file so that libraries may be included out of
-  // order. This follows 'ld.lld' semantics which are more lenient.
-  bool Extracted = true;
   llvm::DenseSet<StringRef> ShouldExtract;
   for (auto &Arg : Args.getAllArgValues(OPT_should_extract))
     ShouldExtract.insert(Arg);
-  while (Extracted) {
-    Extracted = false;
-    for (OffloadFile &Binary : ArchiveFilesToExtract) {
-      // If the binary was previously extracted it will be set to null.
-      if (!Binary.getBinary())
-        continue;
-
-      SmallVector<OffloadFile::TargetID> CompatibleTargets = {Binary};
-      for (const auto &[ID, Input] : InputFiles)
-        if (object::areTargetsCompatible(Binary, ID))
-          CompatibleTargets.emplace_back(ID);
-
-      for (const auto &[Index, ID] : llvm::enumerate(CompatibleTargets)) {
-        // Only extract an if we have an an object matching this target or it
-        // was specifically requested.
-        if (!InputFiles.count(ID) && !ShouldExtract.contains(ID.second))
-          continue;
-
-        Expected<bool> ExtractOrErr =
-            getSymbols(Binary.getBinary()->getImage(),
-                       Binary.getBinary()->getOffloadKind(),
-                       /*IsArchive=*/true, Saver, Syms[ID]);
-        if (!ExtractOrErr)
-          return ExtractOrErr.takeError();
 
-        Extracted = *ExtractOrErr;
+  // We only extract archive members from the fat binary if we find a used or
+  // requested target. Unlike normal static archive handling, we just extract
+  // every object file contained in the archive.
+  for (OffloadFile &Binary : ArchiveFilesToExtract) {
+    if (!Binary.getBinary())
+      continue;
 
-        // Skip including the file if it is an archive that does not resolve
-        // any symbols.
-        if (!Extracted && !ShouldExtract.contains(ID.second))
-          continue;
+    SmallVector<OffloadFile::TargetID> CompatibleTargets = {Binary};
+    for (const auto &[ID, Input] : InputFiles)
+      if (object::areTargetsCompatible(Binary, ID))
+        CompatibleTargets.emplace_back(ID);
 
-        // If another target needs this binary it must be copied instead.
-        if (Index == CompatibleTargets.size() - 1)
-          InputFiles[ID].emplace_back(std::move(Binary));
-        else
-          InputFiles[ID].emplace_back(Binary.copy());
-      }
+    for (const auto &[Index, ID] : llvm::enumerate(CompatibleTargets)) {
+      // Only extract if we have an object matching this target or it
+      // was specifically requested.
+      if (!InputFiles.count(ID) && !ShouldExtract.contains(ID.second))
+        continue;
 
-      // If we extracted any files we need to check all the symbols again.
-      if (Extracted)
-        break;
+      // If another target needs this binary it must be copied instead.
+      if (Index == CompatibleTargets.size() - 1)
+        InputFiles[ID].emplace_back(std::move(Binary));
+      else
+        InputFiles[ID].emplace_back(Binary.copy());
     }
   }
 
diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt
index 1dda902..b6825f9 100644
--- a/clang/unittests/Interpreter/CMakeLists.txt
+++ b/clang/unittests/Interpreter/CMakeLists.txt
@@ -1,3 +1,34 @@
+if(EMSCRIPTEN)
+set(LLVM_COMPONENTS_TO_LINK
+  ""
+  )
+set(LLVM_LIBS_TO_LINK
+  ""
+  )
+set(CLANG_LIBS_TO_LINK
+  clangInterpreter
+  )
+else()
+set(LLVM_COMPONENTS_TO_LINK
+  ${LLVM_TARGETS_TO_BUILD}
+  Core
+  MC
+  OrcJIT
+  Support
+  TargetParser
+  )
+set(LLVM_LIBS_TO_LINK
+  LLVMTestingSupport
+  )
+set(CLANG_LIBS_TO_LINK
+  clangAST
+  clangBasic
+  clangInterpreter
+  clangFrontend
+  clangSema
+  )
+endif()
+
 add_distinct_clang_unittest(ClangReplInterpreterTests
   IncrementalCompilerBuilderTest.cpp
   IncrementalProcessingTest.cpp
@@ -8,24 +39,33 @@ add_distinct_clang_unittest(ClangReplInterpreterTests
   EXPORT_SYMBOLS
 
   CLANG_LIBS
-  clangAST
-  clangBasic
-  clangInterpreter
-  clangFrontend
-  clangSema
+  ${CLANG_LIBS_TO_LINK}
 
   LINK_LIBS
-  LLVMTestingSupport
+  ${LLVM_LIBS_TO_LINK}
 
   LLVM_COMPONENTS
-  ${LLVM_TARGETS_TO_BUILD}
-  Core
-  MC
-  OrcJIT
-  Support
-  TargetParser
+  ${LLVM_COMPONENTS_TO_LINK}
   )
 
+if(EMSCRIPTEN)
+# Without the above you try to link to LLVMSupport twice, and end
+# up with a duplicate symbol error when creating the main module
+get_target_property(LINKED_LIBS ClangReplInterpreterTests LINK_LIBRARIES)
+list(REMOVE_ITEM LINKED_LIBS LLVMSupport)
+set_target_properties(ClangReplInterpreterTests PROPERTIES LINK_LIBRARIES "${LINKED_LIBS}")
+target_link_options(ClangReplInterpreterTests
+  PUBLIC "SHELL: -s MAIN_MODULE=1"
+  PUBLIC "SHELL: -s ALLOW_MEMORY_GROWTH=1"
+  PUBLIC "SHELL: -s STACK_SIZE=32mb"
+  PUBLIC "SHELL: -s INITIAL_MEMORY=128mb"
+  PUBLIC "SHELL: --emrun"
+)
+set_target_properties(ClangReplInterpreterTests PROPERTIES
+  SUFFIX ".html"
+)
+endif()
+
 # Exceptions on Windows are not yet supported.
 if(NOT WIN32)
   add_subdirectory(ExceptionTests)
diff --git a/clang/unittests/Interpreter/CodeCompletionTest.cpp b/clang/unittests/Interpreter/CodeCompletionTest.cpp
index 23cfc46..ceb6834 100644
--- a/clang/unittests/Interpreter/CodeCompletionTest.cpp
+++ b/clang/unittests/Interpreter/CodeCompletionTest.cpp
@@ -29,8 +29,14 @@ public:
   std::unique_ptr<clang::Interpreter> Interp;
 
   void SetUp() override {
+// FIXME: WebAssembly doesn't currently support JIT (see
+// https://github.com/llvm/llvm-project/pull/150977#discussion_r2237521095),
+// so this check of HostSupportsJIT has been skipped
+// over until support is added and HostSupportsJIT can return true.
+#ifndef __EMSCRIPTEN__
     if (!HostSupportsJIT())
       GTEST_SKIP();
+#endif
     std::unique_ptr<CompilerInstance> CI = cantFail(CB.CreateCpp());
     this->Interp = cantFail(clang::Interpreter::create(std::move(CI)));
   }
diff --git a/clang/unittests/Interpreter/IncrementalCompilerBuilderTest.cpp b/clang/unittests/Interpreter/IncrementalCompilerBuilderTest.cpp
index c4a4007..7b4633b 100644
--- a/clang/unittests/Interpreter/IncrementalCompilerBuilderTest.cpp
+++ b/clang/unittests/Interpreter/IncrementalCompilerBuilderTest.cpp
@@ -37,6 +37,14 @@ TEST(IncrementalCompilerBuilder, SetCompilerArgs) {
 }
 
 TEST(IncrementalCompilerBuilder, SetTargetTriple) {
+// FIXME: This test doesn't currently work for Emscripten builds.
+// It should be possible to make it work. For details on how it fails and
+// the current progress on enabling this test, see the following GitHub
+// issue:
+// https://github.com/llvm/llvm-project/issues/153461
+#ifdef __EMSCRIPTEN__
+  GTEST_SKIP() << "Test fails for Emscipten builds";
+#endif
   auto CB = clang::IncrementalCompilerBuilder();
   CB.SetTargetTriple("armv6-none-eabi");
   auto CI = cantFail(CB.CreateCpp());
diff --git a/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp b/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
index 1c27cfb..f50f6e3 100644
--- a/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
+++ b/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
@@ -75,9 +75,14 @@ struct OutOfProcInterpreter : public Interpreter {
 };
 
 TEST_F(InterpreterExtensionsTest, FindRuntimeInterface) {
+// FIXME: WebAssembly doesn't currently support JIT (see
+// https://github.com/llvm/llvm-project/pull/150977#discussion_r2237521095),
+// so this check of HostSupportsJIT is skipped for Emscripten builds
+// until support is added and HostSupportsJIT can return true.
+#ifndef __EMSCRIPTEN__
   if (!HostSupportsJIT())
     GTEST_SKIP();
-
+#endif
   clang::IncrementalCompilerBuilder CB;
   llvm::Error ErrOut = llvm::Error::success();
   auto CI = cantFail(CB.CreateCpp());
diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp
index 768058b..8639fb6 100644
--- a/clang/unittests/Interpreter/InterpreterTest.cpp
+++ b/clang/unittests/Interpreter/InterpreterTest.cpp
@@ -147,6 +147,14 @@ TEST_F(InterpreterTest, DeclsAndStatements) {
 }
 
 TEST_F(InterpreterTest, UndoCommand) {
+// FIXME: This test doesn't currently work for Emscripten builds.
+// It should be possible to make it work. For details on how it fails and
+// the current progress to enable this test see
+// the following GitHub issue:
+// https://github.com/llvm/llvm-project/issues/153461
+#ifdef __EMSCRIPTEN__
+  GTEST_SKIP() << "Test fails for Emscipten builds";
+#endif
   Args ExtraArgs = {"-Xclang", "-diagnostic-log-file", "-Xclang", "-"};
 
   // Create the diagnostic engine with unowned consumer.
@@ -256,6 +264,14 @@ static NamedDecl *LookupSingleName(Interpreter &Interp, const char *Name) {
 }
 
 TEST_F(InterpreterTest, InstantiateTemplate) {
+// FIXME: This test doesn't currently work for Emscripten builds.
+// It should be possible to make it work. For details on how it fails and
+// the current progress to enable this test see
+// the following GitHub issue:
+// https://github.com/llvm/llvm-project/issues/153461
+#ifdef __EMSCRIPTEN__
+  GTEST_SKIP() << "Test fails for Emscipten builds";
+#endif
   // FIXME: We cannot yet handle delayed template parsing. If we run with
   // -fdelayed-template-parsing we try adding the newly created decl to the
   // active PTU which causes an assert.
@@ -295,6 +311,14 @@ TEST_F(InterpreterTest, InstantiateTemplate) {
 }
 
 TEST_F(InterpreterTest, Value) {
+// FIXME: This test doesn't currently work for Emscripten builds.
+// It should be possible to make it work. For details on how it fails and
+// the current progress to enable this test see
+// the following GitHub issue:
+// https://github.com/llvm/llvm-project/issues/153461
+#ifdef __EMSCRIPTEN__
+  GTEST_SKIP() << "Test fails for Emscipten builds";
+#endif
   std::vector<const char *> Args = {"-fno-sized-deallocation"};
   std::unique_ptr<Interpreter> Interp = createInterpreter(Args);
 
diff --git a/clang/unittests/Interpreter/InterpreterTestFixture.h b/clang/unittests/Interpreter/InterpreterTestFixture.h
index 113599f..b088fa4 100644
--- a/clang/unittests/Interpreter/InterpreterTestFixture.h
+++ b/clang/unittests/Interpreter/InterpreterTestFixture.h
@@ -38,8 +38,14 @@ protected:
   }
 
   void SetUp() override {
+// FIXME: WebAssembly doesn't currently support JIT (see
+// https://github.com/llvm/llvm-project/pull/150977#discussion_r2237521095),
+// so this check of HostSupportsJIT is skipped for Emscripten builds
+// until support is added and HostSupportsJIT can return true.
+#ifndef __EMSCRIPTEN__
     if (!HostSupportsJIT())
       GTEST_SKIP();
+#endif
   }
 
   void TearDown() override {}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index e2da20e..af2dcf6 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -972,10 +972,10 @@ Intrinsic::Intrinsic(StringRef Name, StringRef Proto, uint64_t MergeTy,
       BaseType(BT, 'd'), Flags(Flags), ImmChecks(Checks) {
 
   auto FormatGuard = [](StringRef Guard, StringRef Base) -> std::string {
+    if (Guard.empty() || Guard == Base)
+      return Guard.str();
     if (Guard.contains('|'))
       return Base.str() + ",(" + Guard.str() + ")";
-    if (Guard.empty() || Guard == Base || Guard.starts_with(Base.str() + ","))
-      return Guard.str();
     return Base.str() + "," + Guard.str();
   };
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index 3bc24152..d4811ff 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -769,11 +769,17 @@ void internal_join_thread(void *th) { pthread_join((pthread_t)th, 0); }
 static Mutex syslog_lock;
 #  endif
 
+#  if SANITIZER_DRIVERKIT
+#    define SANITIZER_OS_LOG os_log
+#  else
+#    define SANITIZER_OS_LOG os_log_error
+#  endif
+
 void WriteOneLineToSyslog(const char *s) {
 #if !SANITIZER_GO
   syslog_lock.CheckLocked();
   if (GetMacosAlignedVersion() >= MacosVersion(10, 12)) {
-    os_log_error(OS_LOG_DEFAULT, "%{public}s", s);
+    SANITIZER_OS_LOG(OS_LOG_DEFAULT, "%{public}s", s);
   } else {
 #pragma clang diagnostic push
 // as_log is deprecated.
@@ -837,22 +843,22 @@ void LogMessageOnPrintf(const char *str) {
 
 void LogFullErrorReport(const char *buffer) {
 #  if !SANITIZER_GO
-  // Log with os_log_error. This will make it into the crash log.
+  // When logging with os_log_error this will make it into the crash log.
   if (internal_strncmp(SanitizerToolName, "AddressSanitizer",
                        sizeof("AddressSanitizer") - 1) == 0)
-    os_log_error(OS_LOG_DEFAULT, "Address Sanitizer reported a failure.");
+    SANITIZER_OS_LOG(OS_LOG_DEFAULT, "Address Sanitizer reported a failure.");
   else if (internal_strncmp(SanitizerToolName, "UndefinedBehaviorSanitizer",
                             sizeof("UndefinedBehaviorSanitizer") - 1) == 0)
-    os_log_error(OS_LOG_DEFAULT,
-                 "Undefined Behavior Sanitizer reported a failure.");
+    SANITIZER_OS_LOG(OS_LOG_DEFAULT,
+                     "Undefined Behavior Sanitizer reported a failure.");
   else if (internal_strncmp(SanitizerToolName, "ThreadSanitizer",
                             sizeof("ThreadSanitizer") - 1) == 0)
-    os_log_error(OS_LOG_DEFAULT, "Thread Sanitizer reported a failure.");
+    SANITIZER_OS_LOG(OS_LOG_DEFAULT, "Thread Sanitizer reported a failure.");
   else
-    os_log_error(OS_LOG_DEFAULT, "Sanitizer tool reported a failure.");
+    SANITIZER_OS_LOG(OS_LOG_DEFAULT, "Sanitizer tool reported a failure.");
 
   if (common_flags()->log_to_syslog)
-    os_log_error(OS_LOG_DEFAULT, "Consult syslog for more information.");
+    SANITIZER_OS_LOG(OS_LOG_DEFAULT, "Consult syslog for more information.");
 
   // Log to syslog.
   // The logging on OS X may call pthread_create so we need the threading
diff --git a/compiler-rt/test/fuzzer/focus-function.test b/compiler-rt/test/fuzzer/focus-function.test
index 64fd5ee..aa15692 100644
--- a/compiler-rt/test/fuzzer/focus-function.test
+++ b/compiler-rt/test/fuzzer/focus-function.test
@@ -7,15 +7,15 @@ UNSUPPORTED: target=aarch64{{.*}}
 
 RUN: %cpp_compiler %S/OnlySomeBytesTest.cpp -o %t-exe
 
-RUN: %t-exe -runs=100 2>&1 | FileCheck %s --check-prefix=FOCUS_NONE
+RUN: %run %t-exe -runs=100 2>&1 | FileCheck %s --check-prefix=FOCUS_NONE
 FOCUS_NONE-NOT: INFO: Focus function is set to
 FOCUS_NONE-NOT: INFO: {{.*}} inputs touch the focus function
 
-RUN: not %t-exe -runs=100 -focus_function=WRONG 2>&1 | FileCheck %s --check-prefix=FOCUS_WRONG
+RUN: not %run %t-exe -runs=100 -focus_function=WRONG 2>&1 | FileCheck %s --check-prefix=FOCUS_WRONG
 FOCUS_WRONG-NOT: INFO: Focus function is set to
 FOCUS_WRONG: ERROR: Failed to set focus function
 
-RUN: %t-exe -runs=100 -focus_function=f0 2>&1 | FileCheck %s --check-prefix=FOCUS_F0
+RUN: %run %t-exe -runs=100 -focus_function=f0 2>&1 | FileCheck %s --check-prefix=FOCUS_F0
 FOCUS_F0: INFO: Focus function is set to 'f0'
 FOCUS_F0: INFO: 0/1 inputs touch the focus function
 
@@ -26,6 +26,6 @@ RUN: echo ABC$(for((i=0;i<2048;i++)); do echo -n x; done) > %t-corpus/ABC
 RUN: echo AXY$(for((i=0;i<2048;i++)); do echo -n x; done) > %t-corpus/AXY
 RUN: echo ABX$(for((i=0;i<2048;i++)); do echo -n x; done) > %t-corpus/ABX
 
-RUN: %t-exe -runs=10000 -focus_function=f0 %t-corpus 2>&1 | FileCheck %s --check-prefix=CORPUS_1_3
+RUN: %run %t-exe -runs=10000 -focus_function=f0 %t-corpus 2>&1 | FileCheck %s --check-prefix=CORPUS_1_3
 CORPUS_1_3: INFO: 1/3 inputs touch the focus function
 CORPUS_1_3: DONE {{.*}} focus:
diff --git a/compiler-rt/test/hwasan/TestCases/Linux/release-shadow.c b/compiler-rt/test/hwasan/TestCases/Linux/release-shadow.c
index c17dc84..db9df0a 100644
--- a/compiler-rt/test/hwasan/TestCases/Linux/release-shadow.c
+++ b/compiler-rt/test/hwasan/TestCases/Linux/release-shadow.c
@@ -57,7 +57,7 @@ int test_rss_difference(void *p) {
   size_t diff = rss_before - rss_after;
   fprintf(stderr, "diff %zu\n", diff);
   // Check that the difference is at least close to kNumShadowPages.
-  return diff > kNumShadowPages / 2;
+  return diff >= kNumShadowPages / 2;
 }
 
 int main() {
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py
index 5c27775..06515dd 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DAP.py
@@ -361,7 +361,10 @@ class DAP(DebuggerBase, metaclass=abc.ABCMeta):
             # response or the event, since the DAP does not specify an order in which they are sent. May need revisiting
             # if there turns out to be some odd ordering issues, e.g. if we can receive messages in the order
             # ["response: continued", "event: stopped", "event: continued"].
-            if message["command"] == "continue" and message["success"] == True:
+            if (
+                message["command"] in ["continue", "stepIn", "next", "stepOut"]
+                and message["success"] == True
+            ):
                 debugger_state.is_running = True
                 # Reset all state that is invalidated upon program continue.
                 debugger_state.stopped_reason = None
@@ -677,14 +680,18 @@ class DAP(DebuggerBase, metaclass=abc.ABCMeta):
     def _post_step_hook(self):
         """Hook to be executed after completing a step request."""
 
-    def step_in(self):
+    def _step(self, step_request_string):
         self._flush_breakpoints()
         step_req_id = self.send_message(
-            self.make_request("stepIn", {"threadId": self._debugger_state.thread})
+            self.make_request(
+                step_request_string, {"threadId": self._debugger_state.thread}
+            )
         )
         response = self._await_response(step_req_id)
         if not response["success"]:
-            raise DebuggerException("failed to step")
+            raise DebuggerException(
+                f"failed to perform debugger action: '{step_request_string}'"
+            )
         # If we've "stepped" to a breakpoint, then continue to hit the breakpoint properly.
         # NB: This is an issue that only seems relevant to LLDB, but is also harmless outside of LLDB; if it turns out
         #     to cause issues for other debuggers, we can move it to a post-step hook.
@@ -692,6 +699,15 @@ class DAP(DebuggerBase, metaclass=abc.ABCMeta):
             time.sleep(0.001)
         self._post_step_hook()
 
+    def step_in(self):
+        self._step("stepIn")
+
+    def step_next(self):
+        self._step("next")
+
+    def step_out(self):
+        self._step("stepOut")
+
     def go(self) -> ReturnCode:
         self._flush_breakpoints()
         continue_req_id = self.send_message(
diff --git a/flang-rt/include/flang-rt/runtime/io-stmt.h b/flang-rt/include/flang-rt/runtime/io-stmt.h
index 0b3194d..9f71d51 100644
--- a/flang-rt/include/flang-rt/runtime/io-stmt.h
+++ b/flang-rt/include/flang-rt/runtime/io-stmt.h
@@ -438,7 +438,9 @@ template <>
 class ListDirectedStatementState<Direction::Input>
     : public FormattedIoStatementState<Direction::Input> {
 public:
-  RT_API_ATTRS bool inNamelistSequence() const { return inNamelistSequence_; }
+  RT_API_ATTRS const NamelistGroup *namelistGroup() const {
+    return namelistGroup_;
+  }
   RT_API_ATTRS int EndIoStatement();
 
   // Skips value separators, handles repetition and null values.
@@ -451,18 +453,19 @@ public:
   // input statement.  This member function resets some state so that
   // repetition and null values work correctly for each successive
   // NAMELIST input item.
-  RT_API_ATTRS void ResetForNextNamelistItem(bool inNamelistSequence) {
+  RT_API_ATTRS void ResetForNextNamelistItem(
+      const NamelistGroup *namelistGroup) {
     remaining_ = 0;
     if (repeatPosition_) {
       repeatPosition_->Cancel();
     }
     eatComma_ = false;
     realPart_ = imaginaryPart_ = false;
-    inNamelistSequence_ = inNamelistSequence;
+    namelistGroup_ = namelistGroup;
   }
 
 protected:
-  bool inNamelistSequence_{false};
+  const NamelistGroup *namelistGroup_{nullptr};
 
 private:
   int remaining_{0}; // for "r*" repetition
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index 7cf4147..6aeb103 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -369,6 +369,9 @@ RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) {
         return status;
       }
     } else if (!toDerived_->noDestructionNeeded()) {
+      // F'2023 9.7.3.2 p7: "When an intrinsic assignment statement (10.2.1.3)
+      // is executed, any noncoarray allocated allocatable subobject of the
+      // variable is deallocated before the assignment takes place."
       if (int status{
               workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)};
           status != StatOk && status != StatContinue) {
diff --git a/flang-rt/lib/runtime/edit-input.cpp b/flang-rt/lib/runtime/edit-input.cpp
index 80cc085..4f01623 100644
--- a/flang-rt/lib/runtime/edit-input.cpp
+++ b/flang-rt/lib/runtime/edit-input.cpp
@@ -534,7 +534,7 @@ static RT_API_ATTRS ScannedRealInput ScanRealInput(
       next = io.NextInField(remaining, edit);
     }
     if (!next || *next == ')') { // NextInField fails on separators like ')'
-      std::size_t byteCount{0};
+      std::size_t byteCount{1};
       if (!next) {
         next = io.GetCurrentChar(byteCount);
       }
diff --git a/flang-rt/lib/runtime/io-stmt.cpp b/flang-rt/lib/runtime/io-stmt.cpp
index af44a9d..e08088f 100644
--- a/flang-rt/lib/runtime/io-stmt.cpp
+++ b/flang-rt/lib/runtime/io-stmt.cpp
@@ -1086,7 +1086,7 @@ ChildListIoStatementState<DIR>::ChildListIoStatementState(
   if constexpr (DIR == Direction::Input) {
     if (auto *listInput{child.parent()
                 .get_if<ListDirectedStatementState<Direction::Input>>()}) {
-      this->inNamelistSequence_ = listInput->inNamelistSequence();
+      this->namelistGroup_ = listInput->namelistGroup();
     }
   }
 #else
diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp
index cbc3226..44a8fe2 100644
--- a/flang-rt/lib/runtime/namelist.cpp
+++ b/flang-rt/lib/runtime/namelist.cpp
@@ -44,8 +44,7 @@ bool IODEF(OutputNamelist)(Cookie cookie, const NamelistGroup &group) {
     if ((connection.NeedAdvance(prefixLen) &&
             !(io.AdvanceRecord() && EmitAscii(io, " ", 1))) ||
         !EmitAscii(io, prefix, prefixLen) ||
-        (connection.NeedAdvance(
-             Fortran::runtime::strlen(str) + (suffix != ' ')) &&
+        (connection.NeedAdvance(runtime::strlen(str) + (suffix != ' ')) &&
             !(io.AdvanceRecord() && EmitAscii(io, " ", 1)))) {
       return false;
     }
@@ -102,8 +101,8 @@ static constexpr RT_API_ATTRS char NormalizeIdChar(char32_t ch) {
   return static_cast<char>(ch >= 'A' && ch <= 'Z' ? ch - 'A' + 'a' : ch);
 }
 
-static RT_API_ATTRS bool GetLowerCaseName(
-    IoStatementState &io, char buffer[], std::size_t maxLength) {
+static RT_API_ATTRS bool GetLowerCaseName(IoStatementState &io, char buffer[],
+    std::size_t maxLength, bool crashIfTooLong = true) {
   std::size_t byteLength{0};
   if (auto ch{io.GetNextNonBlank(byteLength)}) {
     if (IsLegalIdStart(*ch)) {
@@ -117,8 +116,10 @@ static RT_API_ATTRS bool GetLowerCaseName(
       if (j <= maxLength) {
         return true;
       }
-      io.GetIoErrorHandler().SignalError(
-          "Identifier '%s...' in NAMELIST input group is too long", buffer);
+      if (crashIfTooLong) {
+        io.GetIoErrorHandler().SignalError(
+            "Identifier '%s...' in NAMELIST input group is too long", buffer);
+      }
     }
   }
   return false;
@@ -356,9 +357,8 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc,
     const DescriptorAddendum *addendum{source.Addendum()};
     if (const typeInfo::DerivedType *
         type{addendum ? addendum->derivedType() : nullptr}) {
-      if (const typeInfo::Component *
-          comp{type->FindDataComponent(
-              compName, Fortran::runtime::strlen(compName))}) {
+      if (const typeInfo::Component *comp{
+              type->FindDataComponent(compName, runtime::strlen(compName))}) {
         bool createdDesc{false};
         if (comp->rank() > 0 && source.rank() > 0) {
           // If base and component are both arrays, the component name
@@ -484,7 +484,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
       handler.SignalError("NAMELIST input group has no name");
       return false;
     }
-    if (Fortran::runtime::strcmp(group.groupName, name) == 0) {
+    if (runtime::strcmp(group.groupName, name) == 0) {
       break; // found it
     }
     SkipNamelistGroup(io);
@@ -503,7 +503,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
     }
     std::size_t itemIndex{0};
     for (; itemIndex < group.items; ++itemIndex) {
-      if (Fortran::runtime::strcmp(name, group.item[itemIndex].name) == 0) {
+      if (runtime::strcmp(name, group.item[itemIndex].name) == 0) {
         break;
       }
     }
@@ -577,13 +577,14 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
     if (const auto *addendum{useDescriptor->Addendum()};
         addendum && addendum->derivedType()) {
       const NonTbpDefinedIoTable *table{group.nonTbpDefinedIo};
-      listInput->ResetForNextNamelistItem(/*inNamelistSequence=*/true);
+      listInput->ResetForNextNamelistItem(&group);
       if (!IONAME(InputDerivedType)(cookie, *useDescriptor, table) &&
           handler.InError()) {
         return false;
       }
     } else {
-      listInput->ResetForNextNamelistItem(useDescriptor->rank() > 0);
+      listInput->ResetForNextNamelistItem(
+          useDescriptor->rank() > 0 ? &group : nullptr);
       if (!descr::DescriptorIO<Direction::Input>(io, *useDescriptor) &&
           handler.InError()) {
         return false;
@@ -607,27 +608,51 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
 }
 
 RT_API_ATTRS bool IsNamelistNameOrSlash(IoStatementState &io) {
-  if (auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()}) {
-    if (listInput->inNamelistSequence()) {
-      SavedPosition savedPosition{io};
-      std::size_t byteCount{0};
-      if (auto ch{io.GetNextNonBlank(byteCount)}) {
-        if (IsLegalIdStart(*ch)) {
-          do {
-            io.HandleRelativePosition(byteCount);
-            ch = io.GetCurrentChar(byteCount);
-          } while (ch && IsLegalIdChar(*ch));
-          ch = io.GetNextNonBlank(byteCount);
-          // TODO: how to deal with NaN(...) ambiguity?
-          return ch && (*ch == '=' || *ch == '(' || *ch == '%');
-        } else {
-          return *ch == '/' || *ch == '&' || *ch == '$';
-        }
-      }
+  auto *listInput{io.get_if<ListDirectedStatementState<Direction::Input>>()};
+  if (!listInput || !listInput->namelistGroup()) {
+    return false; // not namelist
+  }
+  SavedPosition savedPosition{io};
+  std::size_t byteCount{0};
+  auto ch{io.GetNextNonBlank(byteCount)};
+  if (!ch) {
+    return false;
+  } else if (!IsLegalIdStart(*ch)) {
+    return *ch == '/' || *ch == '&' || *ch == '$';
+  }
+  char id[nameBufferSize];
+  if (!GetLowerCaseName(io, id, sizeof id, /*crashIfTooLong=*/false)) {
+    return true; // long name
+  }
+  // It looks like a name, but might be "inf" or "nan".  Check what
+  // follows.
+  ch = io.GetNextNonBlank(byteCount);
+  if (!ch) {
+    return false;
+  } else if (*ch == '=' || *ch == '%') {
+    return true;
+  } else if (*ch != '(') {
+    return false;
+  } else if (runtime::strcmp(id, "nan") != 0) {
+    return true;
+  }
+  // "nan(" ambiguity
+  int depth{1};
+  while (true) {
+    io.HandleRelativePosition(byteCount);
+    ch = io.GetNextNonBlank(byteCount);
+    if (depth == 0) {
+      // nan(...) followed by '=', '%', or '('?
+      break;
+    } else if (!ch) {
+      return true; // not a valid NaN(...)
+    } else if (*ch == '(') {
+      ++depth;
+    } else if (*ch == ')') {
+      --depth;
     }
   }
-  return false;
+  return ch && (*ch == '=' || *ch == '%' || *ch == '(');
 }
 
 RT_OFFLOAD_API_GROUP_END
diff --git a/flang-rt/test/Driver/ctofortran.f90 b/flang-rt/test/Driver/ctofortran.f90
index e385e79..4979a4e 100644
--- a/flang-rt/test/Driver/ctofortran.f90
+++ b/flang-rt/test/Driver/ctofortran.f90
@@ -2,8 +2,8 @@
 ! UNSUPPORTED: offload-cuda
 
 ! RUN: split-file %s %t
-! RUN: %clang -I"%include/flang" -c %t/cfile.c -o %t/cfile.o
-! RUN: %flang -L"%libdir" %t/ffile.f90 %t/cfile.o -o %t/ctofortran
+! RUN: %clang %isysroot -I"%include/flang" -c %t/cfile.c -o %t/cfile.o
+! RUN: %flang %isysroot -L"%libdir" %t/ffile.f90 %t/cfile.o -o %t/ctofortran
 ! RUN: env LD_LIBRARY_PATH="$LD_LIBRARY_PATH:%libdir" %t/ctofortran | FileCheck %s
 
 !--- ffile.f90
diff --git a/flang-rt/test/Runtime/no-cpp-dep.c b/flang-rt/test/Runtime/no-cpp-dep.c
index c98678b6..006f44b 100644
--- a/flang-rt/test/Runtime/no-cpp-dep.c
+++ b/flang-rt/test/Runtime/no-cpp-dep.c
@@ -7,7 +7,7 @@ UNSUPPORTED: system-windows
 UNSUPPORTED: offload-cuda
 
 RUN: %if system-aix %{ export OBJECT_MODE=64 %}
-RUN: %cc -std=c99 %s -I%include -L"%libdir" -lflang_rt.runtime -lm \
+RUN: %cc -std=c99 %s %isysroot -I%include -L"%libdir" -lflang_rt.runtime -lm \
 RUN: %if system-aix %{-lpthread %}
 RUN: rm a.out
 */
diff --git a/flang-rt/test/lit.cfg.py b/flang-rt/test/lit.cfg.py
index 032aeef..27f4666 100644
--- a/flang-rt/test/lit.cfg.py
+++ b/flang-rt/test/lit.cfg.py
@@ -62,25 +62,24 @@ config.test_source_root = os.path.dirname(__file__)
 # lit writes a '.lit_test_times.txt' file into this directory.
 config.test_exec_root = config.flang_rt_binary_test_dir
 
-# On MacOS, -isysroot is needed to build binaries.
+# On MacOS, some tests need -isysroot to build binaries.
 isysroot_flag = []
 if config.osx_sysroot:
     isysroot_flag = ["-isysroot", config.osx_sysroot]
+config.substitutions.append(("%isysroot", " ".join(isysroot_flag)))
 
 tools = [
     ToolSubst(
         "%flang",
         command=config.flang,
-        extra_args=isysroot_flag,
         unresolved="fatal",
     ),
     ToolSubst(
         "%clang",
         command=FindTool("clang"),
-        extra_args=isysroot_flag,
         unresolved="fatal",
     ),
-    ToolSubst("%cc", command=config.cc, extra_args=isysroot_flag, unresolved="fatal"),
+    ToolSubst("%cc", command=config.cc, unresolved="fatal"),
 ]
 llvm_config.add_tool_substitutions(tools)
 
diff --git a/flang-rt/unittests/Runtime/Namelist.cpp b/flang-rt/unittests/Runtime/Namelist.cpp
index ee4018e..f190bea 100644
--- a/flang-rt/unittests/Runtime/Namelist.cpp
+++ b/flang-rt/unittests/Runtime/Namelist.cpp
@@ -334,4 +334,35 @@ TEST(NamelistTests, RealValueForInt) {
   EXPECT_EQ(got, expect);
 }
 
+TEST(NamelistTests, NanInputAmbiguity) {
+  OwningPtr<Descriptor> xDesc{// real :: x(5) = 0.
+      MakeArray<TypeCategory::Real, static_cast<int>(sizeof(float))>(
+          std::vector<int>{5}, std::vector<float>{{0, 0, 0, 0, 0}})};
+  OwningPtr<Descriptor> nanDesc{// real :: nan(2) = 0.
+      MakeArray<TypeCategory::Real, static_cast<int>(sizeof(float))>(
+          std::vector<int>{2}, std::vector<float>{{0, 0}})};
+  const NamelistGroup::Item items[]{{"x", *xDesc}, {"nan", *nanDesc}};
+  const NamelistGroup group{"nml", 2, items};
+  static char t1[]{"&nml x=1 2 nan(q) 4 nan(1)=5 nan(q)/"};
+  StaticDescriptor<1, true> statDesc;
+  Descriptor &internalDesc{statDesc.descriptor()};
+  internalDesc.Establish(TypeCode{CFI_type_char},
+      /*elementBytes=*/std::strlen(t1), t1, 0, nullptr, CFI_attribute_pointer);
+  auto inCookie{IONAME(BeginInternalArrayListInput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
+  ASSERT_TRUE(IONAME(InputNamelist)(inCookie, group));
+  ASSERT_EQ(IONAME(EndIoStatement)(inCookie), IostatOk)
+      << "namelist real input for nans";
+  char out[40];
+  internalDesc.Establish(TypeCode{CFI_type_char}, /*elementBytes=*/sizeof out,
+      out, 0, nullptr, CFI_attribute_pointer);
+  auto outCookie{IONAME(BeginInternalArrayListOutput)(
+      internalDesc, nullptr, 0, __FILE__, __LINE__)};
+  ASSERT_TRUE(IONAME(OutputNamelist)(outCookie, group));
+  ASSERT_EQ(IONAME(EndIoStatement)(outCookie), IostatOk) << "namelist output";
+  std::string got{out, sizeof out};
+  static const std::string expect{" &NML X= 1. 2. NaN 4. 0.,NAN= 5. NaN/   "};
+  EXPECT_EQ(got, expect);
+}
+
 // TODO: Internal NAMELIST error tests
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index 1d16c33..c01eb56 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -495,6 +495,9 @@ endif()
 
 include(AddFlang)
 include(FlangCommon)
+include(GetClangResourceDir)
+
+get_clang_resource_dir(HEADER_BINARY_DIR PREFIX ${LLVM_LIBRARY_OUTPUT_INTDIR}/.. SUBDIR include)
 
 if (FLANG_INCLUDE_TESTS)
   add_compile_definitions(FLANG_INCLUDE_TESTS=1)
@@ -575,8 +578,6 @@ endif()
 
 # Put ISO_Fortran_binding.h into the include files of the build area now
 # so that we can run tests before installing
-include(GetClangResourceDir)
-get_clang_resource_dir(HEADER_BINARY_DIR PREFIX ${LLVM_LIBRARY_OUTPUT_INTDIR}/.. SUBDIR include)
 configure_file(
   ${FLANG_SOURCE_DIR}/include/flang/ISO_Fortran_binding.h
   ${HEADER_BINARY_DIR}/ISO_Fortran_binding.h COPYONLY)
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 11c6717..b20503e 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -900,6 +900,16 @@ print *, [(j,j=1,10)]
   since these default values need to be available to process incomplete
   structure constructors.
 
+* When an `ALLOCATE` or `DEALLOCATE` statement with multiple variables
+  has a `STAT=` specifier that allows the program to continue execution
+  after an error, the variables after the one with the error are left
+  deallocated (or allocated).  This interpretation allows the program to
+  identify the variable that encountered the problem while avoiding any
+  ambiguity in the case of multiple errors with distinct status codes.
+  Some compilers work differently; for maximum portability, avoid
+  `ALLOCATE` and `DEALLOCATE` statements with error recovery for
+  multiple variables.
+
 ## De Facto Standard Features
 
 * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the
diff --git a/flang/include/flang/Common/enum-set.h b/flang/include/flang/Common/enum-set.h
index 5290b76..e048c66 100644
--- a/flang/include/flang/Common/enum-set.h
+++ b/flang/include/flang/Common/enum-set.h
@@ -175,10 +175,8 @@ public:
   constexpr bool empty() const { return none(); }
   void clear() { reset(); }
   void insert(enumerationType x) { set(x); }
-  void insert(enumerationType &&x) { set(x); }
-  void emplace(enumerationType &&x) { set(x); }
+  void emplace(enumerationType x) { set(x); }
   void erase(enumerationType x) { reset(x); }
-  void erase(enumerationType &&x) { reset(x); }
 
   constexpr std::optional<enumerationType> LeastElement() const {
     if (empty()) {
diff --git a/flang/include/flang/Evaluate/check-expression.h b/flang/include/flang/Evaluate/check-expression.h
index 0cf12f3..eb15265 100644
--- a/flang/include/flang/Evaluate/check-expression.h
+++ b/flang/include/flang/Evaluate/check-expression.h
@@ -64,6 +64,13 @@ bool IsInitialProcedureTarget(const Symbol &);
 bool IsInitialProcedureTarget(const ProcedureDesignator &);
 bool IsInitialProcedureTarget(const Expr<SomeType> &);
 
+// Emit warnings about default REAL literal constants in contexts where
+// they will be converted to a REAL kind of higher precision than the default.
+void CheckRealWidening(
+    const Expr<SomeType> &, const DynamicType &toType, FoldingContext &);
+void CheckRealWidening(const Expr<SomeType> &,
+    const std::optional<DynamicType> &, FoldingContext &);
+
 // Validate the value of a named constant, the static initial
 // value of a non-pointer non-allocatable non-dummy variable, or the
 // default initializer of a component of a derived type (or instantiation
diff --git a/flang/include/flang/Evaluate/constant.h b/flang/include/flang/Evaluate/constant.h
index d4c6601..9ae37cd 100644
--- a/flang/include/flang/Evaluate/constant.h
+++ b/flang/include/flang/Evaluate/constant.h
@@ -128,17 +128,19 @@ public:
   bool empty() const { return values_.empty(); }
   std::size_t size() const { return values_.size(); }
   const std::vector<Element> &values() const { return values_; }
-  constexpr Result result() const { return result_; }
+  Result &result() { return result_; }
+  const Result &result() const { return result_; }
 
   constexpr DynamicType GetType() const { return result_.GetType(); }
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
+  std::string AsFortran() const;
 
 protected:
   std::vector<Element> Reshape(const ConstantSubscripts &) const;
   std::size_t CopyFrom(const ConstantBase &source, std::size_t count,
       ConstantSubscripts &resultSubscripts, const std::vector<int> *dimOrder);
 
-  Result result_;
+  Result result_; // usually empty except for Real & Complex
   std::vector<Element> values_;
 };
 
@@ -209,6 +211,7 @@ public:
 
   Constant Reshape(ConstantSubscripts &&) const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
+  std::string AsFortran() const;
   DynamicType GetType() const { return {KIND, length_}; }
   std::size_t CopyFrom(const Constant &source, std::size_t count,
       ConstantSubscripts &resultSubscripts, const std::vector<int> *dimOrder);
diff --git a/flang/include/flang/Evaluate/match.h b/flang/include/flang/Evaluate/match.h
new file mode 100644
index 0000000..79da40f7
--- /dev/null
+++ b/flang/include/flang/Evaluate/match.h
@@ -0,0 +1,211 @@
+//===-- include/flang/Evaluate/match.h --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_EVALUATE_MATCH_H_
+#define FORTRAN_EVALUATE_MATCH_H_
+
+#include "flang/Common/visit.h"
+#include "flang/Evaluate/expression.h"
+#include "llvm/ADT/STLExtras.h"
+
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <variant>
+
+namespace Fortran::evaluate {
+namespace match {
+namespace detail {
+// Detects whether `T::operands` names a member of T.
+// Primary template: the fallback, reporting false.
+template <typename, typename = void> //
+struct IsOperation : std::false_type {};
+
+// Preferred whenever `decltype(T::operands)` is well-formed.
+template <typename T>
+struct IsOperation<T, std::void_t<decltype(T::operands)>>
+    : std::true_type {};
+} // namespace detail
+
+template <typename T>
+constexpr bool is_operation_v{detail::IsOperation<T>::value};
+
+template <typename T>
+const evaluate::Expr<T> &deparen(const evaluate::Expr<T> &x) {
+  // Recursively strip Parentheses<T> wrappers from the top of `x`.
+  if (const auto *inner{std::get_if<evaluate::Parentheses<T>>(&x.u)}) {
+    return deparen(inner->template operand<0>());
+  }
+  return x;
+}
+
+// Expr<T> matchers (patterns)
+//
+// Each pattern should implement
+//   bool match(const U &input) const
+// member function that returns `true` when the match was successful,
+// and `false` otherwise.
+//
+// Patterns are intended to be composable, i.e. a pattern can take operands
+// which themselves are patterns. This composition is expected to match if
+// the root pattern and all its operands match given input.
+
+/// Succeeds only when the input's type is exactly `MatchType` (T stripped of
+/// cv-qualifiers and references); on success `ref` points at that input.
+template <typename T> struct TypePattern {
+  using MatchType = llvm::remove_cvref_t<T>;
+
+  template <typename U> bool match(const U &input) const {
+    constexpr bool sameType{std::is_same_v<MatchType, U>};
+    if constexpr (sameType) {
+      ref = &input;
+    }
+    return sameType;
+  }
+
+  // Mutable so that the const-qualified match() can record what it matched.
+  mutable const MatchType *ref{nullptr};
+};
+
+/// Matches when at least one of the patterns given as template arguments
+/// matches. Patterns are tried in order, and `ref` records the first one
+/// that succeeded. Every pattern is constructed from the same operands, so
+/// AnyOfPattern<SomeBinaryOp, OtherBinaryOp> is ok, whereas
+/// AnyOfPattern<SomeBinaryOp, SomeTernaryOp> is not.
+template <typename... Patterns> struct AnyOfPattern {
+  static_assert(sizeof...(Patterns) != 0);
+
+private:
+  using PatternTuple = std::tuple<Patterns...>;
+
+  template <size_t I>
+  using Pattern = typename std::tuple_element<I, PatternTuple>::type;
+
+  template <size_t... Is, typename... Ops>
+  AnyOfPattern(std::index_sequence<Is...>, const Ops &...ops)
+      : patterns(std::make_tuple(Pattern<Is>(ops...)...)) {}
+
+  template <typename P, typename U>
+  bool matchOne(const P &pattern, const U &input) const {
+    if (pattern.match(input)) {
+      ref = &pattern; // address inside this object's `patterns` tuple
+      return true;
+    }
+    return false;
+  }
+
+  template <typename U, size_t... Is>
+  bool matchImpl(const U &input, std::index_sequence<Is...>) const {
+    return (matchOne(std::get<Is>(patterns), input) || ...); // first hit wins
+  }
+
+  PatternTuple patterns;
+
+public:
+  using Indexes = std::index_sequence_for<Patterns...>;
+  using MatchTypes = std::tuple<typename Patterns::MatchType...>;
+
+  template <typename... Ops>
+  AnyOfPattern(const Ops &...ops) : AnyOfPattern(Indexes{}, ops...) {}
+
+  template <typename U> bool match(const U &input) const {
+    return matchImpl(input, Indexes{});
+  }
+
+  mutable std::variant<const Patterns *..., std::monostate> ref{
+      std::monostate{}}; // monostate until some pattern matches
+};
+
+/// Matches any input of type Expr<T>.
+/// The intent of this pattern is to be a leaf in multi-operand patterns.
+template <typename T> //
+struct ExprPattern : public TypePattern<evaluate::Expr<T>> {};
+
+/// Matches evaluate::Expr<T> that contains an operation of type OpType.
+template <typename OpType, typename... Ops>
+struct OperationPattern : public TypePattern<OpType> {
+private:
+  using Indexes = std::index_sequence_for<Ops...>;
+
+  template <typename S, size_t... Is>
+  bool matchImpl(const S &op, std::index_sequence<Is...>) const {
+    using TypeS = llvm::remove_cvref_t<S>;
+    if constexpr (is_operation_v<TypeS>) {
+      if constexpr (TypeS::operands == Indexes::size()) { // arity must agree
+        return TypePattern<OpType>::match(op) &&
+            (std::get<Is>(operands).match(op.template operand<Is>()) && ...);
+      }
+    }
+    return false;
+  }
+
+  std::tuple<const Ops &...> operands; // references: must outlive *this
+
+public:
+  using MatchType = OpType;
+
+  OperationPattern(const Ops &...ops, llvm::type_identity<OpType> = {})
+      : operands(ops...) {}
+
+  template <typename T> bool match(const evaluate::Expr<T> &input) const {
+    return common::visit( // deparen() looks through enclosing parentheses
+        [&](auto &&s) { return matchImpl(s, Indexes{}); }, deparen(input).u);
+  }
+
+  template <typename U> bool match(const U &input) const {
+    // Fallback overload: anything that is not an Expr<T> never matches.
+    return false;
+  }
+};
+
+template <typename OpType, typename... Ops>
+OperationPattern(const Ops &...ops, llvm::type_identity<OpType>)
+    -> OperationPattern<OpType, Ops...>;
+
+// Namespace-level definitions: short aliases plus the match() entry point.
+
+template <typename T> using Expr = ExprPattern<T>; // not evaluate::Expr
+
+template <typename OpType, typename... Ops>
+using Op = OperationPattern<OpType, Ops...>;
+
+template <typename Pattern, typename Input>
+bool match(const Pattern &pattern, const Input &input) {
+  return pattern.match(input); // true when `pattern` accepts `input`
+}
+
+// Specific operation patterns
+
+// -- Add: matches evaluate::Add<Type>; build one with add<Type>(op0, op1).
+template <typename Type, typename Op0, typename Op1>
+struct Add : public Op<evaluate::Add<Type>, Op0, Op1> {
+  using Base = Op<evaluate::Add<Type>, Op0, Op1>;
+
+  Add(const Op0 &op0, const Op1 &op1) : Base(op0, op1) {}
+};
+
+template <typename Type, typename Op0, typename Op1>
+Add<Type, Op0, Op1> add(const Op0 &op0, const Op1 &op1) {
+  return Add<Type, Op0, Op1>(op0, op1);
+}
+
+// -- Mul: matches evaluate::Multiply<Type>; build with mul<Type>(op0, op1).
+template <typename Type, typename Op0, typename Op1>
+struct Mul : public Op<evaluate::Multiply<Type>, Op0, Op1> {
+  using Base = Op<evaluate::Multiply<Type>, Op0, Op1>;
+
+  Mul(const Op0 &op0, const Op1 &op1) : Base(op0, op1) {}
+};
+
+template <typename Type, typename Op0, typename Op1>
+Mul<Type, Op0, Op1> mul(const Op0 &op0, const Op1 &op1) {
+  return Mul<Type, Op0, Op1>(op0, op1);
+}
+} // namespace match
+} // namespace Fortran::evaluate
+
+#endif // FORTRAN_EVALUATE_MATCH_H_
diff --git a/flang/include/flang/Evaluate/real.h b/flang/include/flang/Evaluate/real.h
index 76d25d9..dcd7407 100644
--- a/flang/include/flang/Evaluate/real.h
+++ b/flang/include/flang/Evaluate/real.h
@@ -442,6 +442,7 @@ public:
   // or parenthesized constant expression that produces this value.
   llvm::raw_ostream &AsFortran(
       llvm::raw_ostream &, int kind, bool minimal = false) const;
+  std::string AsFortran(int kind, bool minimal = false) const;
 
 private:
   using Significand = Integer<significandBits>; // no implicit bit
diff --git a/flang/include/flang/Evaluate/type.h b/flang/include/flang/Evaluate/type.h
index f3bba77..222018b 100644
--- a/flang/include/flang/Evaluate/type.h
+++ b/flang/include/flang/Evaluate/type.h
@@ -274,9 +274,26 @@ public:
   using Scalar = value::Integer<8 * KIND>;
 };
 
+// Records when a default REAL literal constant is inexactly converted to binary
+// (e.g., 0.1 but not 0.125) to enable a usage warning if the expression in
+// which it appears undergoes an implicit widening conversion.
+class TrackInexactLiteralConversion { // base of the REAL and COMPLEX Type<>
+public:
+  constexpr bool isFromInexactLiteralConversion() const {
+    return isFromInexactLiteralConversion_;
+  }
+  void set_isFromInexactLiteralConversion(bool yes = true) { // false clears
+    isFromInexactLiteralConversion_ = yes;
+  }
+
+private:
+  bool isFromInexactLiteralConversion_{false}; // false until explicitly set
+};
+
 template <int KIND>
 class Type<TypeCategory::Real, KIND>
-    : public TypeBase<TypeCategory::Real, KIND> {
+    : public TypeBase<TypeCategory::Real, KIND>,
+      public TrackInexactLiteralConversion {
 public:
   static constexpr int precision{common::PrecisionOfRealKind(KIND)};
   static constexpr int bits{common::BitsForBinaryPrecision(precision)};
@@ -289,7 +306,8 @@ public:
 // The KIND type parameter on COMPLEX is the kind of each of its components.
 template <int KIND>
 class Type<TypeCategory::Complex, KIND>
-    : public TypeBase<TypeCategory::Complex, KIND> {
+    : public TypeBase<TypeCategory::Complex, KIND>,
+      public TrackInexactLiteralConversion {
 public:
   using Part = Type<TypeCategory::Real, KIND>;
   using Scalar = value::Complex<typename Part::Scalar>;
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 6c2e6d7..4a831fd 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -62,6 +62,8 @@ cuf::DataAttributeAttr
 translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
                                 const Fortran::semantics::Symbol &sym);
 
+bool isTransferWithConversion(mlir::Value rhs);
+
 } // end namespace Fortran::lower
 
 #endif // FORTRAN_LOWER_CUDA_H
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
index 704faf0..e06289c 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -112,4 +112,9 @@ def GenericLoopConversionPass
   ];
 }
 
+def SimdOnlyPass : Pass<"omp-simd-only", "mlir::ModuleOp"> {
+  let summary = "Filters out non-simd OpenMP constructs";
+  let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
+
 #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index a3f59ee..fd8c43c 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -119,13 +119,16 @@ void registerDefaultInlinerPass(MLIRToLLVMPassPipelineConfig &config);
 void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
                                            MLIRToLLVMPassPipelineConfig &pc);
 
+/// Select which mode to enable OpenMP support in.
+enum class EnableOpenMP { None, Simd, Full };
+
 /// Create a pass pipeline for lowering from HLFIR to FIR
 ///
 /// \param pm - MLIR pass manager that will hold the pipeline definition
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
 void createHLFIRToFIRPassPipeline(
-    mlir::PassManager &pm, bool enableOpenMP,
+    mlir::PassManager &pm, EnableOpenMP enableOpenMP,
     llvm::OptimizationLevel optLevel = defaultOptLevel);
 
 struct OpenMPFIRPassPipelineOpts {
diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h
index dd610c9..cc66cc83 100644
--- a/flang/include/flang/Semantics/openmp-directive-sets.h
+++ b/flang/include/flang/Semantics/openmp-directive-sets.h
@@ -401,6 +401,22 @@ static const OmpDirectiveSet nestedWorkshareErrSet{
         Directive::OMPD_taskloop,
     } | workShareSet,
 };
+
+//===----------------------------------------------------------------------===//
+// Misc directive sets
+//===----------------------------------------------------------------------===//
+
+// Simple standalone directives that can be erased by -fopenmp-simd.
+static const OmpDirectiveSet simpleStandaloneNonSimdOnlySet{
+    Directive::OMPD_taskyield,
+    Directive::OMPD_barrier,
+    Directive::OMPD_ordered,
+    Directive::OMPD_target_enter_data,
+    Directive::OMPD_target_exit_data,
+    Directive::OMPD_target_update,
+    Directive::OMPD_taskwait,
+};
+
 } // namespace llvm::omp
 
 #endif // FORTRAN_SEMANTICS_OPENMP_DIRECTIVE_SETS_H_
diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h
index 743abf6..bd3ff4a 100644
--- a/flang/include/flang/Support/Fortran-features.h
+++ b/flang/include/flang/Support/Fortran-features.h
@@ -78,7 +78,8 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
     MismatchingDummyProcedure, SubscriptedEmptyArray, UnsignedLiteralTruncation,
     CompatibleDeclarationsFromDistinctModules,
     NullActualForDefaultIntentAllocatable, UseAssociationIntoSameNameSubprogram,
-    HostAssociatedIntentOutInSpecExpr, NonVolatilePointerToVolatile)
+    HostAssociatedIntentOutInSpecExpr, NonVolatilePointerToVolatile,
+    RealConstantWidening)
 
 using LanguageFeatures = EnumSet<LanguageFeature, LanguageFeature_enumSize>;
 using UsageWarnings = EnumSet<UsageWarning, UsageWarning_enumSize>;
diff --git a/flang/include/flang/Support/LangOptions.def b/flang/include/flang/Support/LangOptions.def
index d5bf7a2..ba72d7b 100644
--- a/flang/include/flang/Support/LangOptions.def
+++ b/flang/include/flang/Support/LangOptions.def
@@ -58,6 +58,8 @@ LANGOPT(OpenMPTeamSubscription, 1, 0)
 LANGOPT(OpenMPNoThreadState, 1, 0)
 /// Assume that no thread in a parallel region will encounter a parallel region
 LANGOPT(OpenMPNoNestedParallelism, 1, 0)
+/// Use SIMD only OpenMP support.
+LANGOPT(OpenMPSimd, 1, 0)
 
 LANGOPT(VScaleMin, 32, 0)  ///< Minimum vscale range value
 LANGOPT(VScaleMax, 32, 0)  ///< Maximum vscale range value
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index df1da27..51958fa 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -134,6 +134,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
                                       ///< functions.
   bool NSWOnLoopVarInc = true; ///< Add nsw flag to loop variable increments.
   bool EnableOpenMP = false; ///< Enable OpenMP lowering.
+  bool EnableOpenMPSimd = false; ///< Enable OpenMP simd-only mode.
   std::string InstrumentFunctionEntry =
       ""; ///< Name of the instrument-function that is called on each
           ///< function-entry
diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp
index 3d7f01d..522ab19 100644
--- a/flang/lib/Evaluate/check-expression.cpp
+++ b/flang/lib/Evaluate/check-expression.cpp
@@ -405,6 +405,88 @@ bool IsInitialProcedureTarget(const Expr<SomeType> &expr) {
   }
 }
 
+class SuspiciousRealLiteralFinder
+    : public AnyTraverse<SuspiciousRealLiteralFinder> { // warns, see below
+public:
+  using Base = AnyTraverse<SuspiciousRealLiteralFinder>;
+  SuspiciousRealLiteralFinder(int kind, FoldingContext &c)
+      : Base{*this}, kind_{kind}, context_{c} {}
+  using Base::operator();
+  template <int KIND>
+  bool operator()(const Constant<Type<TypeCategory::Real, KIND>> &x) const {
+    if (kind_ > KIND && x.result().isFromInexactLiteralConversion()) {
+      context_.messages().Say(common::UsageWarning::RealConstantWidening,
+          "Default real literal in REAL(%d) context might need a kind suffix, as its rounded value %s is inexact"_warn_en_US,
+          kind_, x.AsFortran());
+      return true; // NOTE(review): presumably ends the AnyTraverse walk
+    } else {
+      return false;
+    }
+  }
+  template <int KIND>
+  bool operator()(const Constant<Type<TypeCategory::Complex, KIND>> &x) const {
+    if (kind_ > KIND && x.result().isFromInexactLiteralConversion()) {
+      context_.messages().Say(common::UsageWarning::RealConstantWidening,
+          "Default real literal in COMPLEX(%d) context might need a kind suffix, as its rounded value %s is inexact"_warn_en_US,
+          kind_, x.AsFortran());
+      return true; // NOTE(review): presumably ends the AnyTraverse walk
+    } else {
+      return false;
+    }
+  }
+  template <TypeCategory TOCAT, int TOKIND, TypeCategory FROMCAT>
+  bool operator()(const Convert<Type<TOCAT, TOKIND>, FROMCAT> &x) const {
+    if constexpr ((TOCAT == TypeCategory::Real ||
+                      TOCAT == TypeCategory::Complex) &&
+        (FROMCAT == TypeCategory::Real || FROMCAT == TypeCategory::Complex)) {
+      auto fromType{x.left().GetType()};
+      if (!fromType || fromType->kind() < TOKIND) {
+        return false; // this conversion itself widens: don't descend into it
+      }
+    }
+    return (*this)(x.left()); // otherwise keep looking inside the operand
+  }
+
+private:
+  int kind_; // kind of the REAL/COMPLEX context being converted to
+  FoldingContext &context_; // receives the warning messages
+};
+
+void CheckRealWidening(const Expr<SomeType> &expr, const DynamicType &toType,
+    FoldingContext &context) {
+  const auto isFloating{[](TypeCategory category) { // REAL or COMPLEX
+    return category == TypeCategory::Real || category == TypeCategory::Complex;
+  }};
+  if (isFloating(toType.category())) {
+    const auto fromType{expr.GetType()};
+    if (fromType && isFloating(fromType->category()) &&
+        toType.kind() > fromType->kind()) {
+      SuspiciousRealLiteralFinder{toType.kind(), context}(expr);
+    }
+  }
+}
+
+void CheckRealWidening(const Expr<SomeType> &expr,
+    const std::optional<DynamicType> &toType, FoldingContext &context) {
+  if (toType.has_value()) { // nothing to check without a known target type
+    CheckRealWidening(expr, *toType, context);
+  }
+}
+
+class InexactLiteralConversionFlagClearer
+    : public AnyTraverse<InexactLiteralConversionFlagClearer> {
+public: // Resets the inexact-literal flag on the REAL constants it visits.
+  using Base = AnyTraverse<InexactLiteralConversionFlagClearer>;
+  InexactLiteralConversionFlagClearer() : Base(*this) {}
+  using Base::operator();
+  template <int KIND> // NOTE(review): COMPLEX constants are not cleared
+  bool operator()(const Constant<Type<TypeCategory::Real, KIND>> &x) const {
+    auto &mut{const_cast<Type<TypeCategory::Real, KIND> &>(x.result())};
+    mut.set_isFromInexactLiteralConversion(false); // x arrives as const
+    return false; // never reports a hit
+  }
+};
+
 // Converts, folds, and then checks type, rank, and shape of an
 // initialization expression for a named constant, a non-pointer
 // variable static initialization, a component default initializer,
@@ -416,6 +498,7 @@ std::optional<Expr<SomeType>> NonPointerInitializationExpr(const Symbol &symbol,
   if (auto symTS{
           characteristics::TypeAndShape::Characterize(symbol, context)}) {
     auto xType{x.GetType()};
+    CheckRealWidening(x, symTS->type(), context);
     auto converted{ConvertToType(symTS->type(), Expr<SomeType>{x})};
     if (!converted &&
         symbol.owner().context().IsEnabled(
@@ -433,6 +516,7 @@ std::optional<Expr<SomeType>> NonPointerInitializationExpr(const Symbol &symbol,
     if (converted) {
       auto folded{Fold(context, std::move(*converted))};
       if (IsActuallyConstant(folded)) {
+        InexactLiteralConversionFlagClearer{}(folded);
         int symRank{symTS->Rank()};
         if (IsImpliedShape(symbol)) {
           if (folded.Rank() == symRank) {
diff --git a/flang/lib/Evaluate/fold-complex.cpp b/flang/lib/Evaluate/fold-complex.cpp
index 3eb8e1f..bcaede5 100644
--- a/flang/lib/Evaluate/fold-complex.cpp
+++ b/flang/lib/Evaluate/fold-complex.cpp
@@ -83,12 +83,21 @@ Expr<Type<TypeCategory::Complex, KIND>> FoldOperation(
   if (auto array{ApplyElementwise(context, x)}) {
     return *array;
   }
-  using Result = Type<TypeCategory::Complex, KIND>;
+  using ComplexType = Type<TypeCategory::Complex, KIND>;
   if (auto folded{OperandsAreConstants(x)}) {
-    return Expr<Result>{
-        Constant<Result>{Scalar<Result>{folded->first, folded->second}}};
+    using RealType = typename ComplexType::Part;
+    Constant<ComplexType> result{
+        Scalar<ComplexType>{folded->first, folded->second}};
+    if (const auto *re{UnwrapConstantValue<RealType>(x.left())};
+        re && re->result().isFromInexactLiteralConversion()) {
+      result.result().set_isFromInexactLiteralConversion();
+    } else if (const auto *im{UnwrapConstantValue<RealType>(x.right())};
+        im && im->result().isFromInexactLiteralConversion()) {
+      result.result().set_isFromInexactLiteralConversion();
+    }
+    return Expr<ComplexType>{std::move(result)};
   }
-  return Expr<Result>{std::move(x)};
+  return Expr<ComplexType>{std::move(x)};
 }
 
 #ifdef _MSC_VER // disable bogus warning about missing definitions
diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h
index 52e954d..7c80d76 100644
--- a/flang/lib/Evaluate/fold-implementation.h
+++ b/flang/lib/Evaluate/fold-implementation.h
@@ -1321,8 +1321,8 @@ public:
               *charLength_, std::move(elements_), ConstantSubscripts{n}}};
         }
       } else {
-        return Expr<T>{
-            Constant<T>{std::move(elements_), ConstantSubscripts{n}}};
+        return Expr<T>{Constant<T>{
+            std::move(elements_), ConstantSubscripts{n}, resultInfo_}};
       }
     }
     return Expr<T>{std::move(array)};
@@ -1343,6 +1343,11 @@ private:
         if (!knownCharLength_) {
           charLength_ = std::max(c->LEN(), charLength_.value_or(-1));
         }
+      } else if constexpr (T::category == TypeCategory::Real ||
+          T::category == TypeCategory::Complex) {
+        if (c->result().isFromInexactLiteralConversion()) {
+          resultInfo_.set_isFromInexactLiteralConversion();
+        }
       }
       return true;
     } else {
@@ -1395,6 +1400,7 @@ private:
   std::vector<Scalar<T>> elements_;
   std::optional<ConstantSubscript> charLength_;
   bool knownCharLength_{false};
+  typename Constant<T>::Result resultInfo_;
 };
 
 template <typename T>
diff --git a/flang/lib/Evaluate/formatting.cpp b/flang/lib/Evaluate/formatting.cpp
index 121afc6..ec5dc0b 100644
--- a/flang/lib/Evaluate/formatting.cpp
+++ b/flang/lib/Evaluate/formatting.cpp
@@ -98,6 +98,14 @@ llvm::raw_ostream &ConstantBase<RESULT, VALUE>::AsFortran(
   return o;
 }
 
+template <typename RESULT, typename VALUE>
+std::string ConstantBase<RESULT, VALUE>::AsFortran() const {
+  std::string text;
+  llvm::raw_string_ostream stream{text}; // writes land in `text`
+  AsFortran(stream);
+  return text;
+}
+
 template <int KIND>
 llvm::raw_ostream &Constant<Type<TypeCategory::Character, KIND>>::AsFortran(
     llvm::raw_ostream &o) const {
@@ -126,6 +134,14 @@ llvm::raw_ostream &Constant<Type<TypeCategory::Character, KIND>>::AsFortran(
   return o;
 }
 
+template <int KIND>
+std::string Constant<Type<TypeCategory::Character, KIND>>::AsFortran() const {
+  std::string text;
+  llvm::raw_string_ostream stream{text}; // writes land in `text`
+  AsFortran(stream);
+  return text;
+}
+
 llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const Symbol &symbol,
     std::optional<parser::CharBlock> name = std::nullopt) {
   const auto &renamings{symbol.owner().context().moduleFileOutputRenamings()};
diff --git a/flang/lib/Evaluate/real.cpp b/flang/lib/Evaluate/real.cpp
index 2c0f283..6e6b9f3 100644
--- a/flang/lib/Evaluate/real.cpp
+++ b/flang/lib/Evaluate/real.cpp
@@ -750,6 +750,14 @@ llvm::raw_ostream &Real<W, P>::AsFortran(
   return o;
 }
 
+template <typename W, int P>
+std::string Real<W, P>::AsFortran(int kind, bool minimal) const {
+  std::string text;
+  llvm::raw_string_ostream stream{text}; // writes land in `text`
+  AsFortran(stream, kind, minimal);
+  return text;
+}
+
 // 16.9.180
 template <typename W, int P> Real<W, P> Real<W, P>::RRSPACING() const {
   if (IsNotANumber()) {
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 111c5aa4..708fb7f 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1162,8 +1162,15 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
                             clang::DiagnosticsEngine &diags) {
   llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp,
                                         clang::driver::options::OPT_fno_openmp);
-  if (!arg || arg->getOption().matches(clang::driver::options::OPT_fno_openmp))
-    return true;
+  if (!arg ||
+      arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) {
+    bool isSimdSpecified = args.hasFlag(
+        clang::driver::options::OPT_fopenmp_simd,
+        clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false);
+    if (!isSimdSpecified)
+      return true;
+    res.getLangOpts().OpenMPSimd = 1;
+  }
 
   unsigned numErrorsBefore = diags.getNumErrors();
   llvm::Triple t(res.getTargetOpts().triple);
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 5c66ecf..3bef6b1 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -298,6 +298,7 @@ bool CodeGenAction::beginSourceFileAction() {
   bool isOpenMPEnabled =
       ci.getInvocation().getFrontendOpts().features.IsEnabled(
           Fortran::common::LanguageFeature::OpenMP);
+  bool isOpenMPSimd = ci.getInvocation().getLangOpts().OpenMPSimd;
 
   fir::OpenMPFIRPassPipelineOpts opts;
 
@@ -329,12 +330,13 @@ bool CodeGenAction::beginSourceFileAction() {
     if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
             mlirModule->getOperation()))
       opts.isTargetDevice = offloadMod.getIsTargetDevice();
+  }
 
-    // WARNING: This pipeline must be run immediately after the lowering to
-    // ensure that the FIR is correct with respect to OpenMP operations/
-    // attributes.
+  // WARNING: This pipeline must be run immediately after the lowering to
+  // ensure that the FIR is correct with respect to OpenMP operations/
+  // attributes.
+  if (isOpenMPEnabled || isOpenMPSimd)
     fir::createOpenMPFIRPassPipeline(pm, opts);
-  }
 
   pm.enableVerifier(/*verifyPasses=*/true);
   pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());
@@ -617,12 +619,14 @@ void CodeGenAction::lowerHLFIRToFIR() {
   pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());
   pm.enableVerifier(/*verifyPasses=*/true);
 
+  fir::EnableOpenMP enableOpenMP = fir::EnableOpenMP::None;
+  if (ci.getInvocation().getFrontendOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::OpenMP))
+    enableOpenMP = fir::EnableOpenMP::Full;
+  if (ci.getInvocation().getLangOpts().OpenMPSimd)
+    enableOpenMP = fir::EnableOpenMP::Simd;
   // Create the pass pipeline
-  fir::createHLFIRToFIRPassPipeline(
-      pm,
-      ci.getInvocation().getFrontendOpts().features.IsEnabled(
-          Fortran::common::LanguageFeature::OpenMP),
-      level);
+  fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP, level);
   (void)mlir::applyPassManagerCLOptions(pm);
 
   mlir::TimingScope timingScopeMLIRPasses = timingScopeRoot.nest(
@@ -748,6 +752,9 @@ void CodeGenAction::generateLLVMIR() {
           Fortran::common::LanguageFeature::OpenMP))
     config.EnableOpenMP = true;
 
+  if (ci.getInvocation().getLangOpts().OpenMPSimd)
+    config.EnableOpenMPSimd = true;
+
   if (ci.getInvocation().getLoweringOpts().getIntegerWrapAround())
     config.NSWOnLoopVarInc = false;
 
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index d16488d..b636416 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -4827,7 +4827,9 @@ private:
 
   void genCUDADataTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
                            const Fortran::evaluate::Assignment &assign,
-                           hlfir::Entity &lhs, hlfir::Entity &rhs) {
+                           hlfir::Entity &lhs, hlfir::Entity &rhs,
+                           bool isWholeAllocatableAssignment,
+                           bool keepLhsLengthInAllocatableAssignment) {
     bool lhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs);
     bool rhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs);
 
@@ -4892,6 +4894,28 @@ private:
 
     // host = device
     if (!lhsIsDevice && rhsIsDevice) {
+      if (Fortran::lower::isTransferWithConversion(rhs)) {
+        mlir::OpBuilder::InsertionGuard insertionGuard(builder);
+        auto elementalOp =
+            mlir::dyn_cast<hlfir::ElementalOp>(rhs.getDefiningOp());
+        assert(elementalOp && "expect elemental op");
+        auto designateOp =
+            *elementalOp.getBody()->getOps<hlfir::DesignateOp>().begin();
+        builder.setInsertionPoint(elementalOp);
+        // Create a temp to transfer the rhs before applying the conversion.
+        hlfir::Entity entity{designateOp.getMemref()};
+        auto [temp, cleanup] = hlfir::createTempFromMold(loc, builder, entity);
+        auto transferKindAttr = cuf::DataTransferKindAttr::get(
+            builder.getContext(), cuf::DataTransferKind::DeviceHost);
+        cuf::DataTransferOp::create(builder, loc, designateOp.getMemref(), temp,
+                                    /*shape=*/mlir::Value{}, transferKindAttr);
+        designateOp.getMemrefMutable().assign(temp);
+        builder.setInsertionPointAfter(elementalOp);
+        hlfir::AssignOp::create(builder, loc, elementalOp, lhs,
+                                isWholeAllocatableAssignment,
+                                keepLhsLengthInAllocatableAssignment);
+        return;
+      }
       auto transferKindAttr = cuf::DataTransferKindAttr::get(
           builder.getContext(), cuf::DataTransferKind::DeviceHost);
       cuf::DataTransferOp::create(builder, loc, rhsVal, lhsVal, shape,
@@ -5039,7 +5063,9 @@ private:
       hlfir::Entity rhs = evaluateRhs(localStmtCtx);
       hlfir::Entity lhs = evaluateLhs(localStmtCtx);
       if (isCUDATransfer && !hasCUDAImplicitTransfer)
-        genCUDADataTransfer(builder, loc, assign, lhs, rhs);
+        genCUDADataTransfer(builder, loc, assign, lhs, rhs,
+                            isWholeAllocatableAssignment,
+                            keepLhsLengthInAllocatableAssignment);
       else
         hlfir::AssignOp::create(builder, loc, rhs, lhs,
                                 isWholeAllocatableAssignment,
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index f6d0078..1293d2c 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -13,6 +13,7 @@
 #include "flang/Lower/CUDA.h"
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
 
 #define DEBUG_TYPE "flang-lower-cuda"
 
@@ -155,3 +156,12 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
       Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
   return cuf::getDataAttribute(mlirContext, cudaAttr);
 }
+
+bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
+  // True iff an ElementalOp defines rhs with one designate, load and convert.
+  auto elOp = rhs.getDefiningOp<hlfir::ElementalOp>(); // null-safe lookup
+  return elOp &&
+      llvm::hasSingleElement(elOp.getBody()->getOps<hlfir::DesignateOp>()) &&
+      llvm::hasSingleElement(elOp.getBody()->getOps<fir::LoadOp>()) &&
+      llvm::hasSingleElement(elOp.getBody()->getOps<fir::ConvertOp>());
+}
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 7f894af..c46bdb3 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -208,11 +208,15 @@ void ClauseProcessor::processTODO(mlir::Location currentLocation,
     if (!x)
       return;
     unsigned version = semaCtx.langOptions().OpenMPVersion;
-    TODO(currentLocation,
-         "Unhandled clause " + llvm::omp::getOpenMPClauseName(id).upper() +
-             " in " +
-             llvm::omp::getOpenMPDirectiveName(directive, version).upper() +
-             " construct");
+    bool isSimdDirective = llvm::omp::getOpenMPDirectiveName(directive, version)
+                               .upper()
+                               .find("SIMD") != llvm::StringRef::npos;
+    if (!semaCtx.langOptions().OpenMPSimd || isSimdDirective)
+      TODO(currentLocation,
+           "Unhandled clause " + llvm::omp::getOpenMPClauseName(id).upper() +
+               " in " +
+               llvm::omp::getOpenMPDirectiveName(directive, version).upper() +
+               " construct");
   };
 
   for (ClauseIterator it = clauses.begin(); it != clauses.end(); ++it)
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index ae60432a..fef64cc 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2262,7 +2262,8 @@ genOrderedOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
              semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
              mlir::Location loc, const ConstructQueue &queue,
              ConstructQueue::const_iterator item) {
-  TODO(loc, "OMPD_ordered");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(loc, "OMPD_ordered");
   return nullptr;
 }
 
@@ -2449,7 +2450,8 @@ genScopeOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
            semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
            mlir::Location loc, const ConstructQueue &queue,
            ConstructQueue::const_iterator item) {
-  TODO(loc, "Scope construct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(loc, "Scope construct");
   return nullptr;
 }
 
@@ -3276,7 +3278,8 @@ static mlir::omp::TaskloopOp genCompositeTaskloopSimd(
     lower::pft::Evaluation &eval, mlir::Location loc,
     const ConstructQueue &queue, ConstructQueue::const_iterator item) {
   assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs");
-  TODO(loc, "Composite TASKLOOP SIMD");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(loc, "Composite TASKLOOP SIMD");
   return nullptr;
 }
 
@@ -3448,8 +3451,10 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
     break;
   case llvm::omp::Directive::OMPD_tile: {
     unsigned version = semaCtx.langOptions().OpenMPVersion;
-    TODO(loc, "Unhandled loop directive (" +
-                  llvm::omp::getOpenMPDirectiveName(dir, version) + ")");
+    if (!semaCtx.langOptions().OpenMPSimd)
+      TODO(loc, "Unhandled loop directive (" +
+                    llvm::omp::getOpenMPDirectiveName(dir, version) + ")");
+    break;
   }
   case llvm::omp::Directive::OMPD_unroll:
     genUnrollOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item);
@@ -3484,35 +3489,40 @@ static void
 genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
        semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
        const parser::OpenMPDeclarativeAllocate &declarativeAllocate) {
-  TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate");
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPDeclarativeAssumes &assumesConstruct) {
-  TODO(converter.getCurrentLocation(), "OpenMP ASSUMES declaration");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMP ASSUMES declaration");
 }
 
 static void
 genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
        semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
        const parser::OmpDeclareVariantDirective &declareVariantDirective) {
-  TODO(converter.getCurrentLocation(), "OmpDeclareVariantDirective");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OmpDeclareVariantDirective");
 }
 
 static void genOMP(
     lower::AbstractConverter &converter, lower::SymMap &symTable,
     semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
     const parser::OpenMPDeclareReductionConstruct &declareReductionConstruct) {
-  TODO(converter.getCurrentLocation(), "OpenMPDeclareReductionConstruct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPDeclareReductionConstruct");
 }
 
 static void
 genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
        semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
        const parser::OpenMPDeclareSimdConstruct &declareSimdConstruct) {
-  TODO(converter.getCurrentLocation(), "OpenMPDeclareSimdConstruct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPDeclareSimdConstruct");
 }
 
 static void
@@ -3706,14 +3716,16 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
   (void)objects;
   (void)clauses;
 
-  TODO(converter.getCurrentLocation(), "OpenMPDepobjConstruct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPDepobjConstruct");
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPInteropConstruct &interopConstruct) {
-  TODO(converter.getCurrentLocation(), "OpenMPInteropConstruct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPInteropConstruct");
 }
 
 static void
@@ -3729,7 +3741,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPAllocatorsConstruct &allocsConstruct) {
-  TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct");
 }
 
 //===----------------------------------------------------------------------===//
@@ -3795,7 +3808,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
         !std::holds_alternative<clause::Detach>(clause.u)) {
       std::string name =
           parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(clause.id));
-      TODO(clauseLocation, name + " clause is not implemented yet");
+      if (!semaCtx.langOptions().OpenMPSimd)
+        TODO(clauseLocation, name + " clause is not implemented yet");
     }
   }
 
@@ -3811,7 +3825,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPAssumeConstruct &assumeConstruct) {
   mlir::Location clauseLocation = converter.genLocation(assumeConstruct.source);
-  TODO(clauseLocation, "OpenMP ASSUME construct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(clauseLocation, "OpenMP ASSUME construct");
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
@@ -3847,21 +3862,24 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPUtilityConstruct &) {
-  TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct");
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPDispatchConstruct &) {
-  TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct");
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPExecutableAllocate &execAllocConstruct) {
-  TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate");
+  if (!semaCtx.langOptions().OpenMPSimd)
+    TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate");
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index fff060b..1b09801 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -616,6 +616,8 @@ void PopulateInitAndCleanupRegionsHelper::populateByRefInitAndCleanupRegions() {
     assert(sym && "Symbol information is required to privatize derived types");
     assert(!scalarInitValue && "ScalarInitvalue is unused for privatization");
   }
+  if (hlfir::Entity{moldArg}.isAssumedRank())
+    TODO(loc, "Privatization of assumed rank variable");
   mlir::Type valTy = fir::unwrapRefType(argType);
 
   if (fir::isa_trivial(valTy)) {
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index e315433..3fb0bac 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -9,6 +9,7 @@ add_flang_library(FlangOpenMPTransforms
   MarkDeclareTarget.cpp
   LowerWorkshare.cpp
   LowerNontemporal.cpp
+  SimdOnly.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp
new file mode 100644
index 0000000..7d332fa
--- /dev/null
+++ b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp
@@ -0,0 +1,212 @@
+//===-- SimdOnly.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/Debug.h"
+
+namespace flangomp {
+#define GEN_PASS_DEF_SIMDONLYPASS
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+namespace {
+
+#define DEBUG_TYPE "omp-simd-only-pass"
+
+/// Rewrite and remove OpenMP operations left after the parse tree rewriting for
+/// -fopenmp-simd is done. If possible, OpenMP constructs should be rewritten at
+/// the parse tree stage. This pass is supposed to only handle complexities
+/// around untangling composite simd constructs, and perform the necessary
+/// cleanup.
+class SimdOnlyConversionPattern : public mlir::RewritePattern {
+public:
+  SimdOnlyConversionPattern(mlir::MLIRContext *ctx)
+      : mlir::RewritePattern(MatchAnyOpTypeTag{}, 1, ctx) {}
+
+  mlir::LogicalResult
+  matchAndRewrite(mlir::Operation *op,
+                  mlir::PatternRewriter &rewriter) const override {
+    if (op->getDialect()->getNamespace() !=
+        mlir::omp::OpenMPDialect::getDialectNamespace())
+      return rewriter.notifyMatchFailure(op, "Not an OpenMP op");
+
+    if (auto simdOp = mlir::dyn_cast<mlir::omp::SimdOp>(op)) {
+      // Remove the composite attr given that the op will no longer be composite
+      if (simdOp.isComposite()) {
+        simdOp.setComposite(false);
+        return mlir::success();
+      }
+
+      return rewriter.notifyMatchFailure(op, "Op is a plain SimdOp");
+    }
+
+    if (op->getParentOfType<mlir::omp::SimdOp>() &&
+        (mlir::isa<mlir::omp::YieldOp>(op) ||
+         mlir::isa<mlir::omp::ScanOp>(op) ||
+         mlir::isa<mlir::omp::LoopNestOp>(op) ||
+         mlir::isa<mlir::omp::TerminatorOp>(op)))
+      return rewriter.notifyMatchFailure(op, "Op is part of a simd construct");
+
+    if (!mlir::isa<mlir::func::FuncOp>(op->getParentOp()) &&
+        (mlir::isa<mlir::omp::TerminatorOp>(op) ||
+         mlir::isa<mlir::omp::YieldOp>(op)))
+      return rewriter.notifyMatchFailure(op,
+                                         "Non top-level yield or terminator");
+
+    LLVM_DEBUG(llvm::dbgs() << "SimdOnlyPass matched OpenMP op:\n");
+    LLVM_DEBUG(op->dump());
+
+    auto eraseUnlessUsedBySimd = [&](mlir::Operation *ompOp,
+                                     mlir::StringAttr name) {
+      if (auto uses =
+              mlir::SymbolTable::getSymbolUses(name, op->getParentOp())) {
+        for (auto &use : *uses)
+          if (mlir::isa<mlir::omp::SimdOp>(use.getUser()))
+            return rewriter.notifyMatchFailure(op,
+                                               "Op used by a simd construct");
+      }
+      rewriter.eraseOp(ompOp);
+      return mlir::success();
+    };
+
+    if (auto ompOp = mlir::dyn_cast<mlir::omp::PrivateClauseOp>(op))
+      return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr());
+    if (auto ompOp = mlir::dyn_cast<mlir::omp::DeclareReductionOp>(op))
+      return eraseUnlessUsedBySimd(ompOp, ompOp.getSymNameAttr());
+
+    // Might be left over from rewriting composite simd with target map
+    if (mlir::isa<mlir::omp::MapBoundsOp>(op)) {
+      rewriter.eraseOp(op);
+      return mlir::success();
+    }
+    if (auto mapInfoOp = mlir::dyn_cast<mlir::omp::MapInfoOp>(op)) {
+      mapInfoOp.getResult().replaceAllUsesWith(mapInfoOp.getVarPtr());
+      rewriter.eraseOp(mapInfoOp);
+      return mlir::success();
+    }
+
+    // Might be leftover after parse tree rewriting
+    if (auto threadPrivateOp = mlir::dyn_cast<mlir::omp::ThreadprivateOp>(op)) {
+      threadPrivateOp.getTlsAddr().replaceAllUsesWith(
+          threadPrivateOp.getSymAddr());
+      rewriter.eraseOp(threadPrivateOp);
+      return mlir::success();
+    }
+
+    fir::FirOpBuilder builder(rewriter, op);
+    mlir::Location loc = op->getLoc();
+
+    // Inline the single region of `ompOp` into the parent at the position of
+    // the op, then erase the op. Returns false (and does nothing) for null.
+    auto inlineSimpleOp = [&](mlir::Operation *ompOp) -> bool {
+      if (!ompOp)
+        return false;
+
+      assert("OpenMP operation has one region" && ompOp->getNumRegions() == 1);
+
+      llvm::SmallVector<std::pair<mlir::Value, mlir::BlockArgument>>
+          blockArgsPairs;
+      if (auto iface =
+              mlir::dyn_cast<mlir::omp::BlockArgOpenMPOpInterface>(ompOp)) {
+        iface.getBlockArgsPairs(blockArgsPairs);
+        for (auto [value, argument] : blockArgsPairs)
+          rewriter.replaceAllUsesWith(argument, value);
+      }
+
+      if (ompOp->getRegion(0).getBlocks().size() == 1) {
+        auto &block = *ompOp->getRegion(0).getBlocks().begin();
+        // This block is about to be removed so any arguments should have been
+        // replaced by now.
+        block.eraseArguments(0, block.getNumArguments());
+        if (auto terminatorOp =
+                mlir::dyn_cast<mlir::omp::TerminatorOp>(block.back()))
+          rewriter.eraseOp(terminatorOp);
+        rewriter.inlineBlockBefore(&block, ompOp, {});
+      } else {
+        // When dealing with multi-block regions we need to fix up the control
+        // flow: branch into the region and from its terminators to the tail.
+        auto *origBlock = ompOp->getBlock();
+        auto *newBlock = rewriter.splitBlock(origBlock, ompOp->getIterator());
+        auto *innerFrontBlock = &ompOp->getRegion(0).getBlocks().front();
+        builder.setInsertionPointToEnd(origBlock);
+        mlir::cf::BranchOp::create(builder, loc, innerFrontBlock);
+        // We are no longer passing any arguments to the first block in the
+        // region, so this should be safe to erase.
+        innerFrontBlock->eraseArguments(0, innerFrontBlock->getNumArguments());
+
+        for (auto &innerBlock : ompOp->getRegion(0).getBlocks()) {
+          // Remove now-unused block arguments. The predicate overload avoids
+          // erasing from the argument list while it is being iterated.
+          innerBlock.eraseArguments(
+              [](mlir::BlockArgument arg) { return arg.use_empty(); });
+          if (auto terminatorOp =
+                  mlir::dyn_cast<mlir::omp::TerminatorOp>(innerBlock.back())) {
+            builder.setInsertionPointToEnd(&innerBlock);
+            mlir::cf::BranchOp::create(builder, loc, newBlock);
+            rewriter.eraseOp(terminatorOp);
+          }
+        }
+
+        rewriter.inlineRegionBefore(ompOp->getRegion(0), newBlock);
+      }
+
+      rewriter.eraseOp(ompOp);
+      return true;
+    };
+
+    // Remove ops that will be surrounding simd once a composite simd construct
+    // goes through the codegen stage. All of the other ones should have already
+    // been removed in the parse tree rewriting stage.
+    if (inlineSimpleOp(mlir::dyn_cast<mlir::omp::TeamsOp>(op)) ||
+        inlineSimpleOp(mlir::dyn_cast<mlir::omp::ParallelOp>(op)) ||
+        inlineSimpleOp(mlir::dyn_cast<mlir::omp::TargetOp>(op)) ||
+        inlineSimpleOp(mlir::dyn_cast<mlir::omp::WsloopOp>(op)) ||
+        inlineSimpleOp(mlir::dyn_cast<mlir::omp::DistributeOp>(op)))
+      return mlir::success();
+
+    op->emitOpError("left unhandled after SimdOnly pass.");
+    return mlir::failure();
+  }
+};
+
+/// Module pass that greedily applies SimdOnlyConversionPattern and signals
+/// pass failure if the rewrite driver reports an error.
+class SimdOnlyPass : public flangomp::impl::SimdOnlyPassBase<SimdOnlyPass> {
+public:
+  SimdOnlyPass() = default;
+
+  void runOnOperation() override {
+    mlir::MLIRContext *ctx = &getContext();
+    mlir::RewritePatternSet rewrites(ctx);
+    rewrites.insert<SimdOnlyConversionPattern>(ctx);
+
+    // Prevent the pattern driver from merging blocks.
+    mlir::GreedyRewriteConfig driverConfig;
+    driverConfig.setRegionSimplificationLevel(
+        mlir::GreedySimplifyRegionLevel::Disabled);
+    mlir::ModuleOp module = getOperation();
+    mlir::LogicalResult status =
+        mlir::applyPatternsGreedily(module, std::move(rewrites), driverConfig);
+    if (mlir::succeeded(status))
+      return;
+    mlir::emitError(module.getLoc(), "Error in SimdOnly conversion pass");
+    signalPassFailure();
+  }
+};
+
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index ca8e8206..5a87092 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -242,7 +242,8 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
 /// \param pm - MLIR pass manager that will hold the pipeline definition
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
-void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
+void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
+                                  EnableOpenMP enableOpenMP,
                                   llvm::OptimizationLevel optLevel) {
   if (optLevel.isOptimizingForSpeed()) {
     addCanonicalizerPassWithoutRegionSimplification(pm);
@@ -294,8 +295,10 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
     addNestedPassToAllTopLevelOperations<PassConstructor>(
         pm, hlfir::createInlineHLFIRAssign);
   pm.addPass(hlfir::createConvertHLFIRtoFIR());
-  if (enableOpenMP)
+  if (enableOpenMP != EnableOpenMP::None)
     pm.addPass(flangomp::createLowerWorkshare());
+  if (enableOpenMP == EnableOpenMP::Simd)
+    pm.addPass(flangomp::createSimdOnlyPass());
 }
 
 /// Create a pass pipeline for handling certain OpenMP transformations needed
@@ -396,7 +399,12 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
 void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
                                   MLIRToLLVMPassPipelineConfig &config,
                                   llvm::StringRef inputFilename) {
-  fir::createHLFIRToFIRPassPipeline(pm, config.EnableOpenMP, config.OptLevel);
+  fir::EnableOpenMP enableOpenMP = fir::EnableOpenMP::None;
+  if (config.EnableOpenMP)
+    enableOpenMP = fir::EnableOpenMP::Full;
+  if (config.EnableOpenMPSimd)
+    enableOpenMP = fir::EnableOpenMP::Simd;
+  fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP, config.OptLevel);
 
   // Add default optimizer pass pipeline.
   fir::createDefaultFIROptimizerPassPipeline(pm, config);
diff --git a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
index 79ed85f..2bca0d9 100644
--- a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
+++ b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
@@ -36,7 +36,7 @@ struct DoLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
     mlir::Value high = doLoopOp.getUpperBound();
     assert(low && high && "must be a Value");
     mlir::Value step = doLoopOp.getStep();
-    llvm::SmallVector<mlir::Value> iterArgs;
+    mlir::SmallVector<mlir::Value> iterArgs;
     if (hasFinalValue)
       iterArgs.push_back(low);
     iterArgs.append(doLoopOp.getIterOperands().begin(),
@@ -88,6 +88,73 @@ struct DoLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
   }
 };
 
+/// Converts fir.iterate_while into scf.while: the "before" region checks
+/// `iv <= upperBound && ok` and the original body becomes the "after" region.
+struct IterWhileConversion : public mlir::OpRewritePattern<fir::IterWhileOp> {
+  using OpRewritePattern<fir::IterWhileOp>::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::IterWhileOp iterWhileOp,
+                  mlir::PatternRewriter &rewriter) const override {
+    mlir::Location loc = iterWhileOp.getLoc();
+    mlir::Value lb = iterWhileOp.getLowerBound();
+    mlir::Value ub = iterWhileOp.getUpperBound();
+    mlir::Value stride = iterWhileOp.getStep();
+    mlir::Value okInit = iterWhileOp.getIterateIn();
+    mlir::ValueRange carried = iterWhileOp.getInitArgs();
+
+    // Loop-carried values: induction variable, carried flag, init args.
+    mlir::SmallVector<mlir::Value> initVals;
+    initVals.push_back(lb);
+    initVals.push_back(okInit);
+    initVals.append(carried.begin(), carried.end());
+    mlir::SmallVector<mlir::Type> loopTypes;
+    for (mlir::Value initVal : initVals)
+      loopTypes.push_back(initVal.getType());
+
+    auto scfWhileOp =
+        mlir::scf::WhileOp::create(rewriter, loc, loopTypes, initVals);
+
+    // Build the "before" block with one argument per loop-carried value.
+    mlir::SmallVector<mlir::Location> argLocs(loopTypes.size(), loc);
+    mlir::Block *condBlock = rewriter.createBlock(
+        &scfWhileOp.getBefore(), scfWhileOp.getBefore().end(), loopTypes,
+        argLocs);
+    mlir::Region::BlockArgListType condArgs =
+        scfWhileOp.getBefore().getArguments();
+    rewriter.setInsertionPointToStart(condBlock);
+
+    // Continue while the induction variable is (signed) <= the upper bound
+    // and the carried flag is set; forward all values to the "after" region.
+    mlir::Value inBounds = mlir::arith::CmpIOp::create(
+        rewriter, loc, mlir::arith::CmpIPredicate::sle, condArgs[0], ub);
+    mlir::Value keepGoing =
+        mlir::arith::AndIOp::create(rewriter, loc, inBounds, condArgs[1]);
+    mlir::scf::ConditionOp::create(rewriter, loc, keepGoing, condArgs);
+
+    // The original body becomes the first block of the "after" region.
+    rewriter.moveBlockBefore(iterWhileOp.getBody(), &scfWhileOp.getAfter(),
+                             scfWhileOp.getAfter().begin());
+    mlir::Block *bodyBlock = scfWhileOp.getAfterBody();
+    auto resultOp = mlir::cast<fir::ResultOp>(bodyBlock->getTerminator());
+    mlir::SmallVector<mlir::Value> yielded(resultOp->getOperands());
+
+    // Increment the induction variable at the top of the body.
+    // NOTE(review): this overwrites the first fir.result operand; presumably
+    // that operand slot corresponds to the induction variable - confirm.
+    rewriter.setInsertionPointToStart(bodyBlock);
+    mlir::Value ivInBody = scfWhileOp.getAfterArguments()[0];
+    yielded[0] = mlir::arith::AddIOp::create(rewriter, loc, ivInBody, stride);
+
+    // Swap fir.result for scf.yield, then carry over the original attributes.
+    rewriter.setInsertionPointToEnd(bodyBlock);
+    rewriter.replaceOpWithNewOp<mlir::scf::YieldOp>(resultOp, yielded);
+    scfWhileOp->setAttrs(iterWhileOp->getAttrs());
+    rewriter.replaceOp(iterWhileOp, scfWhileOp);
+    return mlir::success();
+  }
+};
+
 void copyBlockAndTransformResult(mlir::PatternRewriter &rewriter,
                                  mlir::Block &srcBlock, mlir::Block &dstBlock) {
   mlir::Operation *srcTerminator = srcBlock.getTerminator();
@@ -132,9 +199,10 @@ struct IfConversion : public mlir::OpRewritePattern<fir::IfOp> {
 
 void FIRToSCFPass::runOnOperation() {
   mlir::RewritePatternSet patterns(&getContext());
-  patterns.add<DoLoopConversion, IfConversion>(patterns.getContext());
+  patterns.add<DoLoopConversion, IterWhileConversion, IfConversion>(
+      patterns.getContext());
   mlir::ConversionTarget target(getContext());
-  target.addIllegalOp<fir::DoLoopOp, fir::IfOp>();
+  target.addIllegalOp<fir::DoLoopOp, fir::IterWhileOp, fir::IfOp>();
   target.markUnknownOpDynamicallyLegal([](mlir::Operation *) { return true; });
   if (failed(
           applyPartialConversion(getOperation(), target, std::move(patterns))))
diff --git a/flang/lib/Semantics/check-omp-atomic.cpp b/flang/lib/Semantics/check-omp-atomic.cpp
index 0c0e615..50e63d3 100644
--- a/flang/lib/Semantics/check-omp-atomic.cpp
+++ b/flang/lib/Semantics/check-omp-atomic.cpp
@@ -13,7 +13,9 @@
 #include "check-omp-structure.h"
 
 #include "flang/Common/indirection.h"
+#include "flang/Common/template.h"
 #include "flang/Evaluate/expression.h"
+#include "flang/Evaluate/match.h"
 #include "flang/Evaluate/rewrite.h"
 #include "flang/Evaluate/tools.h"
 #include "flang/Parser/char-block.h"
@@ -50,6 +52,137 @@ static bool operator!=(const evaluate::Expr<T> &e, const evaluate::Expr<U> &f) {
   return !(e == f);
 }
 
+namespace {
+template <typename...> struct IsIntegral {
+  static constexpr bool value{false};
+};
+
+template <common::TypeCategory C, int K>
+struct IsIntegral<evaluate::Type<C, K>> {
+  static constexpr bool value{//
+      C == common::TypeCategory::Integer ||
+      C == common::TypeCategory::Unsigned ||
+      C == common::TypeCategory::Logical};
+};
+
+template <typename T> constexpr bool is_integral_v{IsIntegral<T>::value};
+
+template <typename T, typename Op0, typename Op1>
+using ReassocOpBase = evaluate::match::AnyOfPattern< //
+    evaluate::match::Add<T, Op0, Op1>, //
+    evaluate::match::Mul<T, Op0, Op1>>;
+
+template <typename T, typename Op0, typename Op1>
+struct ReassocOp : public ReassocOpBase<T, Op0, Op1> {
+  using Base = ReassocOpBase<T, Op0, Op1>;
+  using Base::Base;
+};
+
+template <typename T, typename Op0, typename Op1>
+ReassocOp<T, Op0, Op1> reassocOp(const Op0 &op0, const Op1 &op1) {
+  return ReassocOp<T, Op0, Op1>(op0, op1);
+}
+} // namespace
+
+struct ReassocRewriter : public evaluate::rewrite::Identity {
+  using Id = evaluate::rewrite::Identity;
+  struct NonIntegralTag {};
+
+  ReassocRewriter(const SomeExpr &atom) : atom_(atom) {}
+
+  // Try to find cases where the input expression is of the form
+  // (1) (a . b) . c, or
+  // (2) a . (b . c),
+  // where . denotes an associative operation (currently + or *), and a, b, c
+  // are some subexpressions.
+  // If one of the operands in the nested operation is the atomic variable
+  // (with some possible type conversions applied to it), bring it to the
+  // top-level operation, and move the top-level operand into the nested
+  // operation.
+  // For example, assuming x is the atomic variable:
+  //   (a + x) + b  ->  (a + b) + x,  i.e. (conceptually) swap x and b.
+  template <typename T, typename U,
+      typename = std::enable_if_t<is_integral_v<T>>>
+  evaluate::Expr<T> operator()(evaluate::Expr<T> &&x, const U &u) {
+    // As per the above comment, there are 3 subexpressions involved in this
+    // transformation. A match::Expr<T> will match evaluate::Expr<U> when T is
+    // same as U, plus it will store a pointer (ref) to the matched expression.
+    // When the match is successful, the sub[i].ref will point to a, b, x (in
+    // some order) from the example above.
+    evaluate::match::Expr<T> sub[3];
+    auto inner{reassocOp<T>(sub[0], sub[1])};
+    auto outer1{reassocOp<T>(inner, sub[2])}; // inner + something
+    auto outer2{reassocOp<T>(sub[2], inner)}; // something + inner
+#if !defined(__clang__) && !defined(_MSC_VER) && \
+    (__GNUC__ < 8 || (__GNUC__ == 8 && __GNUC_MINOR__ < 5))
+    // If GCC version < 8.5, use this definition. For the other definition
+    // (which is equivalent), GCC 7.5 emits a somewhat cryptic error:
+    //    use of ‘outer1’ before deduction of ‘auto’
+    // inside of the visitor function in common::visit.
+    // Since this works with clang, MSVC and at least GCC 8.5, I'm assuming
+    // that this is some kind of a GCC issue.
+    using MatchTypes = std::tuple<evaluate::Add<T>, evaluate::Multiply<T>>;
+#else
+    using MatchTypes = typename decltype(outer1)::MatchTypes;
+#endif
+    // There is no way to ensure that the outer operation is the same as
+    // the inner one. They are matched independently, so we need to compare
+    // the index in the member variant that represents the matched type.
+    if ((match(outer1, x) && outer1.ref.index() == inner.ref.index()) ||
+        (match(outer2, x) && outer2.ref.index() == inner.ref.index())) {
+      size_t atomIdx{[&]() { // sub[atomIdx] will be the atom.
+        size_t idx;
+        for (idx = 0; idx != 3; ++idx) {
+          if (IsAtom(*sub[idx].ref)) {
+            break;
+          }
+        }
+        return idx;
+      }()};
+
+      if (atomIdx > 2) {
+        return Id::operator()(std::move(x), u);
+      }
+      return common::visit(
+          [&](auto &&s) {
+            using Expr = evaluate::Expr<T>;
+            using TypeS = llvm::remove_cvref_t<decltype(s)>;
+            // This visitor has to be semantically correct for all possible
+            // types of s even though at runtime s will only be one of the
+            // matched types.
+            // Limit the construction to the operation types that we tried
+            // to match (otherwise TypeS(op1, op2) would fail for non-binary
+            // operations).
+            if constexpr (common::HasMember<TypeS, MatchTypes>) {
+              Expr atom{*sub[atomIdx].ref};
+              Expr op1{*sub[(atomIdx + 1) % 3].ref};
+              Expr op2{*sub[(atomIdx + 2) % 3].ref};
+              return Expr(
+                  TypeS(atom, Expr(TypeS(std::move(op1), std::move(op2)))));
+            } else {
+              return Expr(TypeS(s));
+            }
+          },
+          evaluate::match::deparen(x).u);
+    }
+    return Id::operator()(std::move(x), u);
+  }
+
+  template <typename T, typename U,
+      typename = std::enable_if_t<!is_integral_v<T>>>
+  evaluate::Expr<T> operator()(
+      evaluate::Expr<T> &&x, const U &u, NonIntegralTag = {}) {
+    return Id::operator()(std::move(x), u);
+  }
+
+private:
+  template <typename T> bool IsAtom(const evaluate::Expr<T> &x) const {
+    return IsSameOrConvertOf(evaluate::AsGenericExpr(AsRvalue(x)), atom_);
+  }
+
+  const SomeExpr &atom_;
+};
+
 struct AnalyzedCondStmt {
   SomeExpr cond{evaluate::NullPointer{}}; // Default ctor is deleted
   parser::CharBlock source;
@@ -199,6 +332,26 @@ static std::pair<parser::CharBlock, parser::CharBlock> SplitAssignmentSource(
   llvm_unreachable("Could not find assignment operator");
 }
 
+static std::vector<SomeExpr> GetNonAtomExpressions(
+    const SomeExpr &atom, const std::vector<SomeExpr> &exprs) {
+  std::vector<SomeExpr> nonAtom;
+  for (const SomeExpr &e : exprs) {
+    if (!IsSameOrConvertOf(e, atom)) {
+      nonAtom.push_back(e);
+    }
+  }
+  return nonAtom;
+}
+
+static std::vector<SomeExpr> GetNonAtomArguments(
+    const SomeExpr &atom, const SomeExpr &expr) {
+  if (auto &&maybe{GetConvertInput(expr)}) {
+    return GetNonAtomExpressions(
+        atom, GetTopLevelOperationIgnoreResizing(*maybe).second);
+  }
+  return {};
+}
+
 static bool IsCheckForAssociated(const SomeExpr &cond) {
   return GetTopLevelOperationIgnoreResizing(cond).first ==
       operation::Operator::Associated;
@@ -576,6 +729,7 @@ void OmpStructureChecker::CheckAtomicCaptureAssignment(
     const evaluate::Assignment &capture, const SomeExpr &atom,
     parser::CharBlock source) {
   auto [lsrc, rsrc]{SplitAssignmentSource(source)};
+  (void)lsrc;
   const SomeExpr &cap{capture.lhs};
 
   if (!IsVarOrFunctionRef(atom)) {
@@ -592,6 +746,7 @@ void OmpStructureChecker::CheckAtomicCaptureAssignment(
 void OmpStructureChecker::CheckAtomicReadAssignment(
     const evaluate::Assignment &read, parser::CharBlock source) {
   auto [lsrc, rsrc]{SplitAssignmentSource(source)};
+  (void)lsrc;
 
   if (auto maybe{GetConvertInput(read.rhs)}) {
     const SomeExpr &atom{*maybe};
@@ -625,7 +780,8 @@ void OmpStructureChecker::CheckAtomicWriteAssignment(
   }
 }
 
-void OmpStructureChecker::CheckAtomicUpdateAssignment(
+std::optional<evaluate::Assignment>
+OmpStructureChecker::CheckAtomicUpdateAssignment(
     const evaluate::Assignment &update, parser::CharBlock source) {
   // [6.0:191:1-7]
   // An update structured block is update-statement, an update statement
@@ -641,14 +797,47 @@ void OmpStructureChecker::CheckAtomicUpdateAssignment(
   if (!IsVarOrFunctionRef(atom)) {
     ErrorShouldBeVariable(atom, rsrc);
     // Skip other checks.
-    return;
+    return std::nullopt;
   }
 
   CheckAtomicVariable(atom, lsrc);
 
+  auto [hasErrors, tryReassoc]{CheckAtomicUpdateAssignmentRhs(
+      atom, update.rhs, source, /*suppressDiagnostics=*/true)};
+
+  if (!hasErrors) {
+    CheckStorageOverlap(atom, GetNonAtomArguments(atom, update.rhs), source);
+    return std::nullopt;
+  } else if (tryReassoc) {
+    ReassocRewriter ra(atom);
+    SomeExpr raRhs{evaluate::rewrite::Mutator(ra)(update.rhs)};
+
+    std::tie(hasErrors, tryReassoc) = CheckAtomicUpdateAssignmentRhs(
+        atom, raRhs, source, /*suppressDiagnostics=*/true);
+    if (!hasErrors) {
+      CheckStorageOverlap(atom, GetNonAtomArguments(atom, raRhs), source);
+
+      evaluate::Assignment raAssign(update);
+      raAssign.rhs = raRhs;
+      return raAssign;
+    }
+  }
+
+  // This is guaranteed to report errors.
+  CheckAtomicUpdateAssignmentRhs(
+      atom, update.rhs, source, /*suppressDiagnostics=*/false);
+  return std::nullopt;
+}
+
+std::pair<bool, bool> OmpStructureChecker::CheckAtomicUpdateAssignmentRhs(
+    const SomeExpr &atom, const SomeExpr &rhs, parser::CharBlock source,
+    bool suppressDiagnostics) {
+  auto [lsrc, rsrc]{SplitAssignmentSource(source)};
+  (void)lsrc;
+
   std::pair<operation::Operator, std::vector<SomeExpr>> top{
       operation::Operator::Unknown, {}};
-  if (auto &&maybeInput{GetConvertInput(update.rhs)}) {
+  if (auto &&maybeInput{GetConvertInput(rhs)}) {
     top = GetTopLevelOperationIgnoreResizing(*maybeInput);
   }
   switch (top.first) {
@@ -665,29 +854,39 @@ void OmpStructureChecker::CheckAtomicUpdateAssignment(
   case operation::Operator::Identity:
     break;
   case operation::Operator::Call:
-    context_.Say(source,
-        "A call to this function is not a valid ATOMIC UPDATE operation"_err_en_US);
-    return;
+    if (!suppressDiagnostics) {
+      context_.Say(source,
+          "A call to this function is not a valid ATOMIC UPDATE operation"_err_en_US);
+    }
+    return std::make_pair(true, false);
   case operation::Operator::Convert:
-    context_.Say(source,
-        "An implicit or explicit type conversion is not a valid ATOMIC UPDATE operation"_err_en_US);
-    return;
+    if (!suppressDiagnostics) {
+      context_.Say(source,
+          "An implicit or explicit type conversion is not a valid ATOMIC UPDATE operation"_err_en_US);
+    }
+    return std::make_pair(true, false);
   case operation::Operator::Intrinsic:
-    context_.Say(source,
-        "This intrinsic function is not a valid ATOMIC UPDATE operation"_err_en_US);
-    return;
+    if (!suppressDiagnostics) {
+      context_.Say(source,
+          "This intrinsic function is not a valid ATOMIC UPDATE operation"_err_en_US);
+    }
+    return std::make_pair(true, false);
   case operation::Operator::Constant:
   case operation::Operator::Unknown:
-    context_.Say(
-        source, "This is not a valid ATOMIC UPDATE operation"_err_en_US);
-    return;
+    if (!suppressDiagnostics) {
+      context_.Say(
+          source, "This is not a valid ATOMIC UPDATE operation"_err_en_US);
+    }
+    return std::make_pair(true, false);
   default:
     assert(
         top.first != operation::Operator::Identity && "Handle this separately");
-    context_.Say(source,
-        "The %s operator is not a valid ATOMIC UPDATE operation"_err_en_US,
-        operation::ToString(top.first));
-    return;
+    if (!suppressDiagnostics) {
+      context_.Say(source,
+          "The %s operator is not a valid ATOMIC UPDATE operation"_err_en_US,
+          operation::ToString(top.first));
+    }
+    return std::make_pair(true, false);
   }
   // Check how many times `atom` occurs as an argument, if it's a subexpression
   // of an argument, and collect the non-atom arguments.
@@ -708,39 +907,48 @@ void OmpStructureChecker::CheckAtomicUpdateAssignment(
     return count;
   }()};
 
-  bool hasError{false};
+  bool hasError{false}, tryReassoc{false};
   if (subExpr) {
-    context_.Say(rsrc,
-        "The atomic variable %s cannot be a proper subexpression of an argument (here: %s) in the update operation"_err_en_US,
-        atom.AsFortran(), subExpr->AsFortran());
+    if (!suppressDiagnostics) {
+      context_.Say(rsrc,
+          "The atomic variable %s cannot be a proper subexpression of an argument (here: %s) in the update operation"_err_en_US,
+          atom.AsFortran(), subExpr->AsFortran());
+    }
     hasError = true;
   }
   if (top.first == operation::Operator::Identity) {
     // This is "x = y".
     assert((atomCount == 0 || atomCount == 1) && "Unexpected count");
     if (atomCount == 0) {
-      context_.Say(rsrc,
-          "The atomic variable %s should appear as an argument in the update operation"_err_en_US,
-          atom.AsFortran());
+      if (!suppressDiagnostics) {
+        context_.Say(rsrc,
+            "The atomic variable %s should appear as an argument in the update operation"_err_en_US,
+            atom.AsFortran());
+      }
       hasError = true;
     }
   } else {
     if (atomCount == 0) {
-      context_.Say(rsrc,
-          "The atomic variable %s should appear as an argument of the top-level %s operator"_err_en_US,
-          atom.AsFortran(), operation::ToString(top.first));
+      if (!suppressDiagnostics) {
+        context_.Say(rsrc,
+            "The atomic variable %s should appear as an argument of the top-level %s operator"_err_en_US,
+            atom.AsFortran(), operation::ToString(top.first));
+      }
+      // If `atom` is a proper subexpression, and is not present as an
+      // argument on its own, reassociation may be able to help.
+      tryReassoc = subExpr.has_value();
       hasError = true;
     } else if (atomCount > 1) {
-      context_.Say(rsrc,
-          "The atomic variable %s should be exactly one of the arguments of the top-level %s operator"_err_en_US,
-          atom.AsFortran(), operation::ToString(top.first));
+      if (!suppressDiagnostics) {
+        context_.Say(rsrc,
+            "The atomic variable %s should be exactly one of the arguments of the top-level %s operator"_err_en_US,
+            atom.AsFortran(), operation::ToString(top.first));
+      }
       hasError = true;
     }
   }
 
-  if (!hasError) {
-    CheckStorageOverlap(atom, nonAtom, source);
-  }
+  return std::make_pair(hasError, tryReassoc);
 }
 
 void OmpStructureChecker::CheckAtomicConditionalUpdateAssignment(
@@ -843,11 +1051,13 @@ void OmpStructureChecker::CheckAtomicUpdateOnly(
     SourcedActionStmt action{GetActionStmt(&body.front())};
     if (auto maybeUpdate{GetEvaluateAssignment(action.stmt)}) {
       const SomeExpr &atom{maybeUpdate->lhs};
-      CheckAtomicUpdateAssignment(*maybeUpdate, action.source);
+      auto maybeAssign{
+          CheckAtomicUpdateAssignment(*maybeUpdate, action.source)};
+      auto &updateAssign{maybeAssign.has_value() ? maybeAssign : maybeUpdate};
 
       using Analysis = parser::OpenMPAtomicConstruct::Analysis;
       x.analysis = AtomicAnalysis(atom)
-                       .addOp0(Analysis::Update, maybeUpdate)
+                       .addOp0(Analysis::Update, updateAssign)
                        .addOp1(Analysis::None);
     } else if (!IsAssignment(action.stmt)) {
       context_.Say(
@@ -963,16 +1173,19 @@ void OmpStructureChecker::CheckAtomicUpdateCapture(
   using Analysis = parser::OpenMPAtomicConstruct::Analysis;
   int action;
 
+  std::optional<evaluate::Assignment> updateAssign{update};
   if (IsMaybeAtomicWrite(update)) {
     action = Analysis::Write;
     CheckAtomicWriteAssignment(update, uact.source);
   } else {
     action = Analysis::Update;
-    CheckAtomicUpdateAssignment(update, uact.source);
+    if (auto &&maybe{CheckAtomicUpdateAssignment(update, uact.source)}) {
+      updateAssign = maybe;
+    }
   }
   CheckAtomicCaptureAssignment(capture, atom, cact.source);
 
-  if (IsPointerAssignment(update) != IsPointerAssignment(capture)) {
+  if (IsPointerAssignment(*updateAssign) != IsPointerAssignment(capture)) {
     context_.Say(cact.source,
         "The update and capture assignments should both be pointer-assignments or both be non-pointer-assignments"_err_en_US);
     return;
@@ -980,12 +1193,12 @@ void OmpStructureChecker::CheckAtomicUpdateCapture(
 
   if (GetActionStmt(&body.front()).stmt == uact.stmt) {
     x.analysis = AtomicAnalysis(atom)
-                     .addOp0(action, update)
+                     .addOp0(action, updateAssign)
                      .addOp1(Analysis::Read, capture);
   } else {
     x.analysis = AtomicAnalysis(atom)
                      .addOp0(Analysis::Read, capture)
-                     .addOp1(action, update);
+                     .addOp1(action, updateAssign);
   }
 }
 
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index cbe6b2c..bf126bb 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2891,7 +2891,8 @@ static bool CheckSymbolSupportsType(const Scope &scope,
 
 static bool IsReductionAllowedForType(
     const parser::OmpReductionIdentifier &ident, const DeclTypeSpec &type,
-    const Scope &scope, SemanticsContext &context) {
+    bool cannotBeBuiltinReduction, const Scope &scope,
+    SemanticsContext &context) {
   auto isLogical{[](const DeclTypeSpec &type) -> bool {
     return type.category() == DeclTypeSpec::Logical;
   }};
@@ -2902,6 +2903,10 @@ static bool IsReductionAllowedForType(
   auto checkOperator{[&](const parser::DefinedOperator &dOpr) {
     if (const auto *intrinsicOp{
             std::get_if<parser::DefinedOperator::IntrinsicOperator>(&dOpr.u)}) {
+      if (cannotBeBuiltinReduction) {
+        return false;
+      }
+
       // OMP5.2: The type [...] of a list item that appears in a
       // reduction clause must be valid for the combiner expression
       // See F2023: Table 10.2
@@ -2953,7 +2958,8 @@ static bool IsReductionAllowedForType(
         // IAND: arguments must be integers: F2023 16.9.100
         // IEOR: arguments must be integers: F2023 16.9.106
         // IOR: arguments must be integers: F2023 16.9.111
-        if (type.IsNumeric(TypeCategory::Integer)) {
+        if (type.IsNumeric(TypeCategory::Integer) &&
+            !cannotBeBuiltinReduction) {
           return true;
         }
       } else if (realName == "max" || realName == "min") {
@@ -2961,8 +2967,9 @@ static bool IsReductionAllowedForType(
         // F2023 16.9.135
         // MIN: arguments must be integer, real, or character:
         // F2023 16.9.141
-        if (type.IsNumeric(TypeCategory::Integer) ||
-            type.IsNumeric(TypeCategory::Real) || isCharacter(type)) {
+        if ((type.IsNumeric(TypeCategory::Integer) ||
+                type.IsNumeric(TypeCategory::Real) || isCharacter(type)) &&
+            !cannotBeBuiltinReduction) {
           return true;
         }
       }
@@ -2995,9 +3002,16 @@ void OmpStructureChecker::CheckReductionObjectTypes(
   GetSymbolsInObjectList(objects, symbols);
 
   for (auto &[symbol, source] : symbols) {
+    // Built-in reductions require types which can be used in their initializer
+    // and combiner expressions. For example, for +:
+    // r = 0; r = r + r2
+    // But it might be valid to use these with DECLARE REDUCTION.
+    // Assumed size is already caught elsewhere.
+    bool cannotBeBuiltinReduction{evaluate::IsAssumedRank(*symbol)};
     if (auto *type{symbol->GetType()}) {
       const auto &scope{context_.FindScope(symbol->name())};
-      if (!IsReductionAllowedForType(ident, *type, scope, context_)) {
+      if (!IsReductionAllowedForType(
+              ident, *type, cannotBeBuiltinReduction, scope, context_)) {
         context_.Say(source,
             "The type of '%s' is incompatible with the reduction operator."_err_en_US,
             symbol->name());
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 6b33ca6..a973aee2 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -267,8 +267,10 @@ private:
       const evaluate::Assignment &read, parser::CharBlock source);
   void CheckAtomicWriteAssignment(
       const evaluate::Assignment &write, parser::CharBlock source);
-  void CheckAtomicUpdateAssignment(
+  std::optional<evaluate::Assignment> CheckAtomicUpdateAssignment(
       const evaluate::Assignment &update, parser::CharBlock source);
+  std::pair<bool, bool> CheckAtomicUpdateAssignmentRhs(const SomeExpr &atom,
+      const SomeExpr &rhs, parser::CharBlock source, bool suppressDiagnostics);
   void CheckAtomicConditionalUpdateAssignment(const SomeExpr &cond,
       parser::CharBlock condSource, const evaluate::Assignment &assign,
       parser::CharBlock assignSource);
diff --git a/flang/lib/Semantics/data-to-inits.cpp b/flang/lib/Semantics/data-to-inits.cpp
index b4c83ba..1c45438 100644
--- a/flang/lib/Semantics/data-to-inits.cpp
+++ b/flang/lib/Semantics/data-to-inits.cpp
@@ -285,21 +285,22 @@ template <typename DSV>
 std::optional<std::pair<SomeExpr, bool>>
 DataInitializationCompiler<DSV>::ConvertElement(
     const SomeExpr &expr, const evaluate::DynamicType &type) {
+  evaluate::FoldingContext &foldingContext{exprAnalyzer_.GetFoldingContext()};
+  evaluate::CheckRealWidening(expr, type, foldingContext);
   if (auto converted{evaluate::ConvertToType(type, SomeExpr{expr})}) {
     return {std::make_pair(std::move(*converted), false)};
   }
   // Allow DATA initialization with Hollerith and kind=1 CHARACTER like
   // (most) other Fortran compilers do.
-  if (auto converted{evaluate::HollerithToBOZ(
-          exprAnalyzer_.GetFoldingContext(), expr, type)}) {
+  if (auto converted{evaluate::HollerithToBOZ(foldingContext, expr, type)}) {
     return {std::make_pair(std::move(*converted), true)};
   }
   SemanticsContext &context{exprAnalyzer_.context()};
   if (context.IsEnabled(common::LanguageFeature::LogicalIntegerAssignment)) {
     if (MaybeExpr converted{evaluate::DataConstantConversionExtension(
-            exprAnalyzer_.GetFoldingContext(), type, expr)}) {
+            foldingContext, type, expr)}) {
       context.Warn(common::LanguageFeature::LogicalIntegerAssignment,
-          exprAnalyzer_.GetFoldingContext().messages().at(),
+          foldingContext.messages().at(),
           "nonstandard usage: initialization of %s with %s"_port_en_US,
           type.AsFortran(), expr.GetType().value().AsFortran());
       return {std::make_pair(std::move(*converted), false)};
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 92dbe0e..d022378 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -828,7 +828,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(
 
 template <typename TYPE>
 Constant<TYPE> ReadRealLiteral(
-    parser::CharBlock source, FoldingContext &context) {
+    parser::CharBlock source, FoldingContext &context, bool isDefaultKind) {
   const char *p{source.begin()};
   auto valWithFlags{
       Scalar<TYPE>::Read(p, context.targetCharacteristics().roundingMode())};
@@ -838,19 +838,24 @@ Constant<TYPE> ReadRealLiteral(
   if (context.targetCharacteristics().areSubnormalsFlushedToZero()) {
     value = value.FlushSubnormalToZero();
   }
-  return {value};
+  typename Constant<TYPE>::Result resultInfo;
+  resultInfo.set_isFromInexactLiteralConversion(
+      isDefaultKind && valWithFlags.flags.test(RealFlag::Inexact));
+  return {value, resultInfo};
 }
 
 struct RealTypeVisitor {
   using Result = std::optional<Expr<SomeReal>>;
   using Types = RealTypes;
 
-  RealTypeVisitor(int k, parser::CharBlock lit, FoldingContext &ctx)
-      : kind{k}, literal{lit}, context{ctx} {}
+  RealTypeVisitor(
+      int k, parser::CharBlock lit, FoldingContext &ctx, bool isDeftKind)
+      : kind{k}, literal{lit}, context{ctx}, isDefaultKind{isDeftKind} {}
 
   template <typename T> Result Test() {
     if (kind == T::kind) {
-      return {AsCategoryExpr(ReadRealLiteral<T>(literal, context))};
+      return {
+          AsCategoryExpr(ReadRealLiteral<T>(literal, context, isDefaultKind))};
     }
     return std::nullopt;
   }
@@ -858,6 +863,7 @@ struct RealTypeVisitor {
   int kind;
   parser::CharBlock literal;
   FoldingContext &context;
+  bool isDefaultKind;
 };
 
 // Reads a real literal constant and encodes it with the right kind.
@@ -909,8 +915,9 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::RealLiteralConstant &x) {
           "Explicit kind parameter together with non-'E' exponent letter is not standard"_port_en_US);
     }
   }
-  auto result{common::SearchTypes(
-      RealTypeVisitor{kind, x.real.source, GetFoldingContext()})};
+  bool isDefaultKind{!x.kind && letterKind.value_or('e') == 'e'};
+  auto result{common::SearchTypes(RealTypeVisitor{
+      kind, x.real.source, GetFoldingContext(), isDefaultKind})};
   if (!result) { // C717
     Say("Unsupported REAL(KIND=%d)"_err_en_US, kind);
   }
@@ -1841,8 +1848,7 @@ void ArrayConstructorContext::Push(MaybeExpr &&x) {
           if (*thisLen != *constantLength_ && !(messageDisplayedSet_ & 1)) {
             exprAnalyzer_.Warn(
                 common::LanguageFeature::DistinctArrayConstructorLengths,
-                "Character literal in array constructor without explicit "
-                "type has different length than earlier elements"_port_en_US);
+                "Character literal in array constructor without explicit type has different length than earlier elements"_port_en_US);
             messageDisplayedSet_ |= 1;
           }
           if (*thisLen > *constantLength_) {
@@ -1862,17 +1868,17 @@ void ArrayConstructorContext::Push(MaybeExpr &&x) {
     } else {
       if (!(messageDisplayedSet_ & 2)) {
         exprAnalyzer_.Say(
-            "Values in array constructor must have the same declared type "
-            "when no explicit type appears"_err_en_US); // C7110
+            "Values in array constructor must have the same declared type when no explicit type appears"_err_en_US); // C7110
         messageDisplayedSet_ |= 2;
       }
     }
   } else {
+    CheckRealWidening(*x, *type_, exprAnalyzer_.GetFoldingContext());
     if (auto cast{ConvertToType(*type_, std::move(*x))}) {
       values_.Push(std::move(*cast));
     } else if (!(messageDisplayedSet_ & 4)) {
-      exprAnalyzer_.Say("Value in array constructor of type '%s' could not "
-                        "be converted to the type of the array '%s'"_err_en_US,
+      exprAnalyzer_.Say(
+          "Value in array constructor of type '%s' could not be converted to the type of the array '%s'"_err_en_US,
           x->GetType()->AsFortran(), type_->AsFortran()); // C7111, C7112
       messageDisplayedSet_ |= 4;
     }
@@ -2065,8 +2071,9 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::ArrayConstructor &array) {
 
 // Check if implicit conversion of expr to the symbol type is legal (if needed),
 // and make it explicit if requested.
-static MaybeExpr ImplicitConvertTo(const semantics::Symbol &sym,
-    Expr<SomeType> &&expr, bool keepConvertImplicit) {
+static MaybeExpr ImplicitConvertTo(const Symbol &sym, Expr<SomeType> &&expr,
+    bool keepConvertImplicit, FoldingContext &foldingContext) {
+  CheckRealWidening(expr, DynamicType::From(sym), foldingContext);
   if (!keepConvertImplicit) {
     return ConvertToType(sym, std::move(expr));
   } else {
@@ -2293,10 +2300,12 @@ MaybeExpr ExpressionAnalyzer::CheckStructureConstructor(
       // convert would cause a segfault. Lowering will deal with
       // conditionally converting and preserving the lower bounds in this
       // case.
-      if (MaybeExpr converted{ImplicitConvertTo(
-              *symbol, std::move(value), IsAllocatable(*symbol))}) {
-        if (auto componentShape{GetShape(GetFoldingContext(), *symbol)}) {
-          if (auto valueShape{GetShape(GetFoldingContext(), *converted)}) {
+      FoldingContext &foldingContext{GetFoldingContext()};
+      if (MaybeExpr converted{ImplicitConvertTo(*symbol, std::move(value),
+              /*keepConvertImplicit=*/IsAllocatable(*symbol),
+              foldingContext)}) {
+        if (auto componentShape{GetShape(foldingContext, *symbol)}) {
+          if (auto valueShape{GetShape(foldingContext, *converted)}) {
             if (GetRank(*componentShape) == 0 && GetRank(*valueShape) > 0) {
               AttachDeclaration(
                   Say(exprSource,
@@ -2310,7 +2319,7 @@ MaybeExpr ExpressionAnalyzer::CheckStructureConstructor(
               if (checked && *checked && GetRank(*componentShape) > 0 &&
                   GetRank(*valueShape) == 0 &&
                   (IsDeferredShape(*symbol) ||
-                      !IsExpandableScalar(*converted, GetFoldingContext(),
+                      !IsExpandableScalar(*converted, foldingContext,
                           *componentShape, true /*admit PURE call*/))) {
                 AttachDeclaration(
                     Say(exprSource,
@@ -4827,6 +4836,11 @@ std::optional<ProcedureRef> ArgumentAnalyzer::TryDefinedAssignment() {
     // conversion in this case.
     if (lhsType) {
       if (rhsType) {
+        FoldingContext &foldingContext{context_.GetFoldingContext()};
+        auto restorer{foldingContext.messages().SetLocation(
+            actuals_.at(1).value().sourceLocation().value_or(
+                foldingContext.messages().at()))};
+        CheckRealWidening(rhs, lhsType, foldingContext);
         if (!IsAllocatableDesignator(lhs) || context_.inWhereBody()) {
           AddAssignmentConversion(*lhsType, *rhsType);
         }
diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp
index 4eeb1b9..b301976 100644
--- a/flang/lib/Semantics/rewrite-parse-tree.cpp
+++ b/flang/lib/Semantics/rewrite-parse-tree.cpp
@@ -12,6 +12,7 @@
 #include "flang/Parser/parse-tree-visitor.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Parser/tools.h"
+#include "flang/Semantics/openmp-directive-sets.h"
 #include "flang/Semantics/scope.h"
 #include "flang/Semantics/semantics.h"
 #include "flang/Semantics/symbol.h"
@@ -41,11 +42,23 @@ public:
 
   void Post(parser::Name &);
   bool Pre(parser::MainProgram &);
+  bool Pre(parser::Module &);
   bool Pre(parser::FunctionSubprogram &);
   bool Pre(parser::SubroutineSubprogram &);
   bool Pre(parser::SeparateModuleSubprogram &);
   bool Pre(parser::BlockConstruct &);
+  bool Pre(parser::Block &);
+  bool Pre(parser::DoConstruct &);
+  bool Pre(parser::IfConstruct &);
   bool Pre(parser::ActionStmt &);
+  void Post(parser::MainProgram &);
+  void Post(parser::FunctionSubprogram &);
+  void Post(parser::SubroutineSubprogram &);
+  void Post(parser::SeparateModuleSubprogram &);
+  void Post(parser::BlockConstruct &);
+  void Post(parser::Block &);
+  void Post(parser::DoConstruct &);
+  void Post(parser::IfConstruct &);
   void Post(parser::ReadStmt &);
   void Post(parser::WriteStmt &);
 
@@ -67,8 +80,15 @@ public:
   bool Pre(parser::EndSubroutineStmt &) { return false; }
   bool Pre(parser::EndTypeStmt &) { return false; }
 
+  bool Pre(parser::OpenMPBlockConstruct &);
+  bool Pre(parser::OpenMPLoopConstruct &);
+  void Post(parser::OpenMPBlockConstruct &);
+  void Post(parser::OpenMPLoopConstruct &);
+
 private:
   void FixMisparsedStmtFuncs(parser::SpecificationPart &, parser::Block &);
+  void OpenMPSimdOnly(parser::Block &, bool);
+  void OpenMPSimdOnly(parser::SpecificationPart &);
 
   SemanticsContext &context_;
   bool errorOnUnresolvedName_{true};
@@ -96,6 +116,132 @@ static bool ReturnsDataPointer(const Symbol &symbol) {
   return false;
 }
 
+// Returns true when the directive of the begin-loop directive of `ompLoop` is
+// a member of llvm::omp::allSimdSet. `ompLoop` is dereferenced unconditionally
+// and therefore must not be null.
+static bool LoopConstructIsSIMD(parser::OpenMPLoopConstruct *ompLoop) {
+  const auto &beginDir{std::get<parser::OmpBeginLoopDirective>(ompLoop->t)};
+  const auto &loopDir{std::get<parser::OmpLoopDirective>(beginDir.t)};
+  return llvm::omp::allSimdSet.test(loopDir.v);
+}
+
+// Drops the non-SIMD OpenMP declarative constructs (THREADPRIVATE and
+// DECLARE MAPPER) from a specification part once they have been parsed.
+// Doing this here massively simplifies the logic inside the SimdOnlyPass for
+// -fopenmp-simd.
+void RewriteMutator::OpenMPSimdOnly(parser::SpecificationPart &specPart) {
+  auto &decls{std::get<std::list<parser::DeclarationConstruct>>(specPart.t)};
+  decls.remove_if([](const parser::DeclarationConstruct &decl) {
+    const auto *specConstr{
+        std::get_if<parser::SpecificationConstruct>(&decl.u)};
+    if (!specConstr) {
+      return false;
+    }
+    const auto *ompDecl{std::get_if<
+        common::Indirection<parser::OpenMPDeclarativeConstruct>>(
+        &specConstr->u)};
+    if (!ompDecl) {
+      return false;
+    }
+    const auto &declarative{ompDecl->value().u};
+    return std::holds_alternative<parser::OpenMPThreadprivate>(declarative) ||
+        std::holds_alternative<parser::OpenMPDeclareMapperConstruct>(
+            declarative);
+  });
+}
+
+// Remove non-SIMD OpenMPConstructs once they are parsed.
+// This massively simplifies the logic inside the SimdOnlyPass for
+// -fopenmp-simd. `isNonSimdLoopBody` should be set to true if `block` is the
+// body of a non-simd OpenMP loop. This is to indicate that scan constructs
+// should be removed from the body, where they would be kept if it were a simd
+// loop.
+//
+// Summary of what happens to each kind of OpenMP construct in `block`:
+//  - CANCEL, FLUSH, CANCELLATION POINT: erased.
+//  - simple standalone constructs: erased when the directive is in
+//    llvm::omp::simpleStandaloneNonSimdOnlySet, or when it is SCAN and
+//    `isNonSimdLoopBody` is true.
+//  - block, atomic and critical constructs: replaced by their body.
+//  - sections constructs: replaced by the bodies of all sections, in order.
+//  - loop constructs: kept when the directive is a SIMD one; otherwise
+//    replaced by the nested DO construct (whose body is processed
+//    recursively). A non-SIMD loop construct that has no directly nested DO
+//    construct is left in place.
+// Statements spliced in from a removed construct are not revisited by this
+// invocation.
+void RewriteMutator::OpenMPSimdOnly(
+    parser::Block &block, bool isNonSimdLoopBody = false) {
+  // Moves the contents of `innerBlock` into `block` at the position of `it`,
+  // erases `it`, and returns the iterator to the element that originally
+  // followed `it` (i.e. the position just past the spliced-in statements).
+  auto replaceInlineBlock =
+      [&](std::list<parser::ExecutionPartConstruct> &innerBlock,
+          auto it) -> auto {
+    auto insertPos = std::next(it);
+    block.splice(insertPos, innerBlock);
+    block.erase(it);
+    return insertPos;
+  };
+
+  // The iterator is advanced manually: every branch that erases or replaces
+  // the current element sets `it` itself and `continue`s.
+  for (auto it{block.begin()}; it != block.end();) {
+    if (auto *stmt{std::get_if<parser::ExecutableConstruct>(&it->u)}) {
+      if (auto *omp{std::get_if<common::Indirection<parser::OpenMPConstruct>>(
+              &stmt->u)}) {
+        if (auto *ompStandalone{std::get_if<parser::OpenMPStandaloneConstruct>(
+                &omp->value().u)}) {
+          if (std::holds_alternative<parser::OpenMPCancelConstruct>(
+                  ompStandalone->u) ||
+              std::holds_alternative<parser::OpenMPFlushConstruct>(
+                  ompStandalone->u) ||
+              std::holds_alternative<parser::OpenMPCancellationPointConstruct>(
+                  ompStandalone->u)) {
+            it = block.erase(it);
+            continue;
+          }
+          if (auto *constr{std::get_if<parser::OpenMPSimpleStandaloneConstruct>(
+                  &ompStandalone->u)}) {
+            auto directive = constr->v.DirId();
+            // Scan should only be removed from non-simd loops
+            if (llvm::omp::simpleStandaloneNonSimdOnlySet.test(directive) ||
+                (isNonSimdLoopBody && directive == llvm::omp::OMPD_scan)) {
+              it = block.erase(it);
+              continue;
+            }
+          }
+        } else if (auto *ompBlock{std::get_if<parser::OpenMPBlockConstruct>(
+                       &omp->value().u)}) {
+          it = replaceInlineBlock(std::get<parser::Block>(ompBlock->t), it);
+          continue;
+        } else if (auto *ompLoop{std::get_if<parser::OpenMPLoopConstruct>(
+                       &omp->value().u)}) {
+          if (LoopConstructIsSIMD(ompLoop)) {
+            ++it;
+            continue;
+          }
+          auto &nest =
+              std::get<std::optional<parser::NestedConstruct>>(ompLoop->t);
+
+          // The nested construct is optional: only look inside it when it is
+          // actually present, instead of calling value() unconditionally.
+          auto *doConstruct =
+              nest ? std::get_if<parser::DoConstruct>(&*nest) : nullptr;
+          if (doConstruct) {
+            auto &loopBody = std::get<parser::Block>(doConstruct->t);
+            // We can only remove some constructs from a loop when it's _not_ an
+            // OpenMP simd loop
+            OpenMPSimdOnly(loopBody, /*isNonSimdLoopBody=*/true);
+            // Move the DO construct out before the OpenMP construct that owns
+            // it is erased, then put it in the erased construct's place.
+            auto newDoConstruct = std::move(*doConstruct);
+            auto newLoop = parser::ExecutionPartConstruct{
+                parser::ExecutableConstruct{std::move(newDoConstruct)}};
+            it = block.erase(it);
+            block.insert(it, std::move(newLoop));
+            continue;
+          }
+        } else if (auto *ompCon{std::get_if<parser::OpenMPSectionsConstruct>(
+                       &omp->value().u)}) {
+          auto &sections =
+              std::get<std::list<parser::OpenMPConstruct>>(ompCon->t);
+          // Splice the body of every section, in order, right after the
+          // sections construct, then drop the construct itself.
+          auto insertPos = std::next(it);
+          for (auto &sectionCon : sections) {
+            auto &section =
+                std::get<parser::OpenMPSectionConstruct>(sectionCon.u);
+            auto &innerBlock = std::get<parser::Block>(section.t);
+            block.splice(insertPos, innerBlock);
+          }
+          block.erase(it);
+          it = insertPos;
+          continue;
+        } else if (auto *atomic{std::get_if<parser::OpenMPAtomicConstruct>(
+                       &omp->value().u)}) {
+          it = replaceInlineBlock(std::get<parser::Block>(atomic->t), it);
+          continue;
+        } else if (auto *critical{std::get_if<parser::OpenMPCriticalConstruct>(
+                       &omp->value().u)}) {
+          it = replaceInlineBlock(std::get<parser::Block>(critical->t), it);
+          continue;
+        }
+      }
+    }
+    ++it;
+  }
+}
+
 // Finds misparsed statement functions in a specification part, rewrites
 // them into array element assignment statements, and moves them into the
 // beginning of the corresponding (execution part's) block.
@@ -133,33 +279,155 @@ void RewriteMutator::FixMisparsedStmtFuncs(
 bool RewriteMutator::Pre(parser::MainProgram &program) {
   FixMisparsedStmtFuncs(std::get<parser::SpecificationPart>(program.t),
       std::get<parser::ExecutionPart>(program.t).v);
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::ExecutionPart>(program.t).v);
+    OpenMPSimdOnly(std::get<parser::SpecificationPart>(program.t));
+  }
+  return true;
+}
+
+void RewriteMutator::Post(parser::MainProgram &program) {
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::ExecutionPart>(program.t).v);
+  }
+}
+
+bool RewriteMutator::Pre(parser::Module &module) {
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::SpecificationPart>(module.t));
+  }
   return true;
 }
 
 bool RewriteMutator::Pre(parser::FunctionSubprogram &func) {
   FixMisparsedStmtFuncs(std::get<parser::SpecificationPart>(func.t),
       std::get<parser::ExecutionPart>(func.t).v);
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::ExecutionPart>(func.t).v);
+  }
   return true;
 }
 
+void RewriteMutator::Post(parser::FunctionSubprogram &func) {
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::ExecutionPart>(func.t).v);
+  }
+}
+
 bool RewriteMutator::Pre(parser::SubroutineSubprogram &subr) {
   FixMisparsedStmtFuncs(std::get<parser::SpecificationPart>(subr.t),
       std::get<parser::ExecutionPart>(subr.t).v);
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::ExecutionPart>(subr.t).v);
+  }
   return true;
 }
 
+void RewriteMutator::Post(parser::SubroutineSubprogram &subr) {
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::ExecutionPart>(subr.t).v);
+  }
+}
+
 bool RewriteMutator::Pre(parser::SeparateModuleSubprogram &subp) {
   FixMisparsedStmtFuncs(std::get<parser::SpecificationPart>(subp.t),
       std::get<parser::ExecutionPart>(subp.t).v);
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::ExecutionPart>(subp.t).v);
+  }
   return true;
 }
 
+void RewriteMutator::Post(parser::SeparateModuleSubprogram &subp) {
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::ExecutionPart>(subp.t).v);
+  }
+}
+
 bool RewriteMutator::Pre(parser::BlockConstruct &block) {
   FixMisparsedStmtFuncs(std::get<parser::BlockSpecificationPart>(block.t).v,
       std::get<parser::Block>(block.t));
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::Block>(block.t));
+  }
+  return true;
+}
+
+void RewriteMutator::Post(parser::BlockConstruct &block) {
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(std::get<parser::Block>(block.t));
+  }
+}
+
+bool RewriteMutator::Pre(parser::Block &block) {
+  if (context_.langOptions().OpenMPSimd) {
+    OpenMPSimdOnly(block);
+  }
   return true;
 }
 
+void RewriteMutator::Post(parser::Block &block) { this->Pre(block); }
+
+bool RewriteMutator::Pre(parser::OpenMPBlockConstruct &block) {
+  if (context_.langOptions().OpenMPSimd) {
+    auto &innerBlock = std::get<parser::Block>(block.t);
+    OpenMPSimdOnly(innerBlock);
+  }
+  return true;
+}
+
+void RewriteMutator::Post(parser::OpenMPBlockConstruct &block) {
+  this->Pre(block);
+}
+
+bool RewriteMutator::Pre(parser::OpenMPLoopConstruct &ompLoop) {
+  if (context_.langOptions().OpenMPSimd) {
+    if (LoopConstructIsSIMD(&ompLoop)) {
+      return true;
+    }
+    // If we're looking at a non-simd OpenMP loop, we need to explicitly
+    // call OpenMPSimdOnly on the nested loop block while indicating where
+    // the block comes from.
+    auto &nest = std::get<std::optional<parser::NestedConstruct>>(ompLoop.t);
+    if (!nest.has_value()) {
+      return true;
+    }
+    if (auto *doConstruct = std::get_if<parser::DoConstruct>(&*nest)) {
+      auto &innerBlock = std::get<parser::Block>(doConstruct->t);
+      OpenMPSimdOnly(innerBlock, /*isNonSimdLoopBody=*/true);
+    }
+  }
+  return true;
+}
+
+void RewriteMutator::Post(parser::OpenMPLoopConstruct &ompLoop) {
+  this->Pre(ompLoop);
+}
+
+bool RewriteMutator::Pre(parser::DoConstruct &doConstruct) {
+  if (context_.langOptions().OpenMPSimd) {
+    auto &innerBlock = std::get<parser::Block>(doConstruct.t);
+    OpenMPSimdOnly(innerBlock);
+  }
+  return true;
+}
+
+void RewriteMutator::Post(parser::DoConstruct &doConstruct) {
+  this->Pre(doConstruct);
+}
+
+bool RewriteMutator::Pre(parser::IfConstruct &ifConstruct) {
+  if (context_.langOptions().OpenMPSimd) {
+    auto &innerBlock = std::get<parser::Block>(ifConstruct.t);
+    OpenMPSimdOnly(innerBlock);
+  }
+  return true;
+}
+
+void RewriteMutator::Post(parser::IfConstruct &ifConstruct) {
+  this->Pre(ifConstruct);
+}
+
 // Rewrite PRINT NML -> WRITE(*,NML=NML)
 bool RewriteMutator::Pre(parser::ActionStmt &x) {
   if (auto *print{std::get_if<common::Indirection<parser::PrintStmt>>(&x.u)};
diff --git a/flang/lib/Support/Fortran-features.cpp b/flang/lib/Support/Fortran-features.cpp
index df51b3c..6a61149 100644
--- a/flang/lib/Support/Fortran-features.cpp
+++ b/flang/lib/Support/Fortran-features.cpp
@@ -147,6 +147,7 @@ LanguageFeatureControl::LanguageFeatureControl() {
   warnUsage_.set(UsageWarning::UseAssociationIntoSameNameSubprogram);
   warnUsage_.set(UsageWarning::HostAssociatedIntentOutInSpecExpr);
   warnUsage_.set(UsageWarning::NonVolatilePointerToVolatile);
+  warnUsage_.set(UsageWarning::RealConstantWidening);
   // New warnings, on by default
   warnLanguage_.set(LanguageFeature::SavedLocalInSpecExpr);
   warnLanguage_.set(LanguageFeature::NullActualForAllocatable);
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 8257170..4f3625a 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -324,6 +324,20 @@ implicit none
       real(8), value :: x
     end function
   end interface
+
+  interface __sad
+    attributes(device) integer function __sad(i,j,k) bind(c, name='__nv_sad')
+      !dir$ ignore_tkr (d) i, (d) j, (d) k
+      integer, value :: i,j,k
+    end function
+  end interface
+
+  interface __usad
+    attributes(device) integer function __usad(i,j,k) bind(c, name='__nv_usad')
+      !dir$ ignore_tkr (d) i, (d) j, (d) k
+      integer, value :: i,j,k
+    end function
+  end interface
   
   interface signbit
     attributes(device) integer(4) function signbitf(x) bind(c,name='__nv_signbitf')
@@ -450,85 +464,85 @@ implicit none
     end function
   end interface
 
-  interface __double2int_rn
-    attributes(device) integer function __double2int_rn(r) bind(c)
+  interface __double2int_rd
+    attributes(device) integer function __double2int_rd(r) bind(c, name='__nv_double2int_rd')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
-  interface __double2int_rz
-    attributes(device) integer function __double2int_rz(r) bind(c)
+  interface __double2int_rn
+    attributes(device) integer function __double2int_rn(r) bind(c, name='__nv_double2int_rn')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
   interface __double2int_ru
-    attributes(device) integer function __double2int_ru(r) bind(c)
+    attributes(device) integer function __double2int_ru(r) bind(c, name='__nv_double2int_ru')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
-  interface __double2int_rd
-    attributes(device) integer function __double2int_rd(r) bind(c)
+  interface __double2int_rz
+    attributes(device) integer function __double2int_rz(r) bind(c, name='__nv_double2int_rz')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
-  interface __double2uint_rn
-    attributes(device) integer function __double2uint_rn(r) bind(c)
+  interface __double2uint_rd
+    attributes(device) integer function __double2uint_rd(r) bind(c, name='__nv_double2uint_rd')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
-  interface __double2uint_rz
-    attributes(device) integer function __double2uint_rz(r) bind(c)
+  interface __double2uint_rn
+    attributes(device) integer function __double2uint_rn(r) bind(c, name='__nv_double2uint_rn')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
   interface __double2uint_ru
-    attributes(device) integer function __double2uint_ru(r) bind(c)
+    attributes(device) integer function __double2uint_ru(r) bind(c, name='__nv_double2uint_ru')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
-  interface __double2uint_rd
-    attributes(device) integer function __double2uint_rd(r) bind(c)
+  interface __double2uint_rz
+    attributes(device) integer function __double2uint_rz(r) bind(c, name='__nv_double2uint_rz')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
   interface __double2float_rn
-    attributes(device) real function __double2float_rn(r) bind(c)
+    attributes(device) real function __double2float_rn(r) bind(c, name='__nv_double2float_rn')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
   interface __double2float_rz
-    attributes(device) real function __double2float_rz(r) bind(c)
+    attributes(device) real function __double2float_rz(r) bind(c, name='__nv_double2float_rz')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
   interface __double2float_ru
-    attributes(device) real function __double2float_ru(r) bind(c)
+    attributes(device) real function __double2float_ru(r) bind(c, name='__nv_double2float_ru')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
   end interface
 
   interface __double2float_rd
-    attributes(device) real function __double2float_rd(r) bind(c)
+    attributes(device) real function __double2float_rd(r) bind(c, name='__nv_double2float_rd')
       !dir$ ignore_tkr (d) r
       double precision, value :: r
     end function
@@ -625,57 +639,57 @@ implicit none
     end function
   end interface
 
-  interface __ll2double_rn
-    attributes(device) double precision function __ll2double_rn(i) bind(c)
+  interface __ll2double_rd
+    attributes(device) double precision function __ll2double_rd(i) bind(c, name='__nv_ll2double_rd')
       !dir$ ignore_tkr (d) i
       integer(8), value :: i
     end function
   end interface
 
-  interface __ll2double_rz
-    attributes(device) double precision function __ll2double_rz(i) bind(c)
+  interface __ll2double_rn
+    attributes(device) double precision function __ll2double_rn(i) bind(c, name='__nv_ll2double_rn')
       !dir$ ignore_tkr (d) i
       integer(8), value :: i
     end function
   end interface
 
   interface __ll2double_ru
-    attributes(device) double precision function __ll2double_ru(i) bind(c)
+    attributes(device) double precision function __ll2double_ru(i) bind(c, name='__nv_ll2double_ru')
       !dir$ ignore_tkr (d) i
       integer(8), value :: i
     end function
   end interface
 
-  interface __ll2double_rd
-    attributes(device) double precision function __ll2double_rd(i) bind(c)
+  interface __ll2double_rz
+    attributes(device) double precision function __ll2double_rz(i) bind(c, name='__nv_ll2double_rz')
       !dir$ ignore_tkr (d) i
       integer(8), value :: i
     end function
   end interface
 
-  interface __ull2double_rn
-    attributes(device) double precision function __ull2double_rn(i) bind(c)
+  interface __ull2double_rd
+    attributes(device) double precision function __ull2double_rd(i) bind(c, name='__nv_ull2double_rd')
       !dir$ ignore_tkr (d) i
       integer(8), value :: i
     end function
   end interface
 
-  interface __ull2double_rz
-    attributes(device) double precision function __ull2double_rz(i) bind(c)
+  interface __ull2double_rn
+    attributes(device) double precision function __ull2double_rn(i) bind(c, name='__nv_ull2double_rn')
       !dir$ ignore_tkr (d) i
       integer(8), value :: i
     end function
   end interface
 
   interface __ull2double_ru
-    attributes(device) double precision function __ull2double_ru(i) bind(c)
+    attributes(device) double precision function __ull2double_ru(i) bind(c, name='__nv_ull2double_ru')
       !dir$ ignore_tkr (d) i
       integer(8), value :: i
     end function
   end interface
 
-  interface __ull2double_rd
-    attributes(device) double precision function __ull2double_rd(i) bind(c)
+  interface __ull2double_rz
+    attributes(device) double precision function __ull2double_rz(i) bind(c, name='__nv_ull2double_rz')
       !dir$ ignore_tkr (d) i
       integer(8), value :: i
     end function
@@ -695,15 +709,15 @@ implicit none
     end function
   end interface
 
-  interface __dsqrt_ru
-    attributes(device) double precision function __dsqrt_ru(x) bind(c)
+  interface __dsqrt_rd
+    attributes(device) double precision function __dsqrt_rd(x) bind(c, name='__nv_dsqrt_rd')
       !dir$ ignore_tkr (d) x
       double precision, value :: x
     end function
   end interface
 
-  interface __dsqrt_rd
-    attributes(device) double precision function __dsqrt_rd(x) bind(c)
+  interface __dsqrt_ru
+    attributes(device) double precision function __dsqrt_ru(x) bind(c, name='__nv_dsqrt_ru')
       !dir$ ignore_tkr (d) x
       double precision, value :: x
     end function
diff --git a/flang/test/Driver/fopenmp-simd.f90 b/flang/test/Driver/fopenmp-simd.f90
new file mode 100644
index 0000000..b25adee
--- /dev/null
+++ b/flang/test/Driver/fopenmp-simd.f90
@@ -0,0 +1,59 @@
+! RUN: %flang -target x86_64-linux-gnu -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY
+! RUN: %flang -target x86_64-darwin -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY
+! RUN: %flang -target x86_64-freebsd -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY
+! RUN: %flang -target x86_64-windows-gnu -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY
+
+! CHECK-OPENMP-SIMD-FLAG: "-fopenmp-simd"
+! CHECK-NO-LD-ANY-NOT: "-l{{(omp|gomp|iomp5)}}"
+
+! -fopenmp-simd enables OpenMP support only for simd constructs
+! RUN: %flang_fc1 -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP-SIMD %s
+! RUN: %flang_fc1 -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-NO-OMP-SIMD %s
+! RUN: %flang_fc1 -fopenmp-simd -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-NO-OMP-SIMD %s
+! RUN: %flang_fc1 -fno-openmp-simd -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP-SIMD %s
+! -fopenmp-simd should have no effect if -fopenmp is already set
+! RUN: %flang_fc1 -fopenmp %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s
+! RUN: %flang_fc1 -fopenmp -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s
+! RUN: %flang_fc1 -fopenmp -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s
+
+subroutine main
+  ! CHECK-OMP-SIMD-NOT: omp.parallel
+  ! CHECK-OMP-SIMD-NOT: omp.wsloop
+  ! CHECK-OMP-SIMD-NOT: omp.loop_nest
+  ! CHECK-OMP-SIMD: fir.do_loop
+  ! CHECK-NO-OMP-SIMD-NOT: omp.parallel
+  ! CHECK-NO-OMP-SIMD-NOT: omp.wsloop
+  ! CHECK-NO-OMP-SIMD-NOT: omp.loop_nest
+  ! CHECK-NO-OMP-SIMD: fir.do_loop
+  ! CHECK-OMP: omp.parallel
+  ! CHECK-OMP: omp.wsloop
+  ! CHECK-OMP: omp.loop_nest
+  ! CHECK-OMP-NOT: fir.do_loop
+  !$omp parallel do
+  do i = 1, 10
+    print *, "test"
+  end do
+  ! CHECK-NO-OMP-SIMD-NOT: omp.yield
+  ! CHECK-NO-OMP-SIMD-NOT: omp.terminator
+  ! CHECK-OMP-SIMD-NOT: omp.yield
+  ! CHECK-OMP-SIMD-NOT: omp.terminator
+  ! CHECK-OMP: omp.yield
+  ! CHECK-OMP: omp.terminator
+  !$omp end parallel do
+
+  ! CHECK-OMP-SIMD: omp.simd
+  ! CHECK-NO-OMP-SIMD-NOT: omp.simd
+  ! CHECK-OMP: omp.simd
+  !$omp simd
+  ! CHECK-OMP-SIMD: omp.loop_nest
+  ! CHECK-NO-OMP-SIMD-NOT: omp.loop_nest
+  ! CHECK-NO-OMP-SIMD: fir.do_loop
+  ! CHECK-OMP: omp.loop_nest
+  ! CHECK-OMP-NOT: fir.do_loop
+  do i = 1, 10
+    print *, "test"
+  ! CHECK-OMP-SIMD: omp.yield
+  ! CHECK-NO-OMP-SIMD-NOT: omp.yield
+  ! CHECK-OMP: omp.yield
+  end do
+end subroutine
diff --git a/flang/test/Fir/FirToSCF/iter-while.fir b/flang/test/Fir/FirToSCF/iter-while.fir
new file mode 100644
index 0000000..0de7aab
--- /dev/null
+++ b/flang/test/Fir/FirToSCF/iter-while.fir
@@ -0,0 +1,99 @@
+// RUN: fir-opt %s --fir-to-scf | FileCheck %s
+
+// CHECK-LABEL:   func.func @test_simple_iterate_while_1() -> (index, i1, i16, i32) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 11 : index
+// CHECK:           %[[VAL_1:.*]] = arith.constant 22 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 2 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant true
+// CHECK:           %[[VAL_4:.*]] = arith.constant 123 : i16
+// CHECK:           %[[VAL_5:.*]] = arith.constant 456 : i32
+// CHECK:           %[[VAL_6:.*]]:4 = scf.while (%[[VAL_7:.*]] = %[[VAL_0]], %[[VAL_8:.*]] = %[[VAL_3]], %[[VAL_9:.*]] = %[[VAL_4]], %[[VAL_10:.*]] = %[[VAL_5]]) : (index, i1, i16, i32) -> (index, i1, i16, i32) {
+// CHECK:             %[[VAL_11:.*]] = arith.cmpi sle, %[[VAL_7]], %[[VAL_1]] : index
+// CHECK:             %[[VAL_12:.*]] = arith.andi %[[VAL_11]], %[[VAL_8]] : i1
+// CHECK:             scf.condition(%[[VAL_12]]) %[[VAL_7]], %[[VAL_8]], %[[VAL_9]], %[[VAL_10]] : index, i1, i16, i32
+// CHECK:           } do {
+// CHECK:           ^bb0(%[[VAL_13:.*]]: index, %[[VAL_14:.*]]: i1, %[[VAL_15:.*]]: i16, %[[VAL_16:.*]]: i32):
+// CHECK:             %[[VAL_17:.*]] = arith.addi %[[VAL_13]], %[[VAL_2]] : index
+// CHECK:             %[[VAL_18:.*]] = arith.constant true
+// CHECK:             %[[VAL_19:.*]] = arith.constant 22 : i16
+// CHECK:             %[[VAL_20:.*]] = arith.constant 33 : i32
+// CHECK:             scf.yield %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]] : index, i1, i16, i32
+// CHECK:           }
+// CHECK:           return %[[VAL_21:.*]]#0, %[[VAL_21]]#1, %[[VAL_21]]#2, %[[VAL_21]]#3 : index, i1, i16, i32
+// CHECK:         }
+func.func @test_simple_iterate_while_1() -> (index, i1, i16, i32) {
+  %lo = arith.constant 11 : index
+  %up = arith.constant 22 : index
+  %step = arith.constant 2 : index
+  %ok = arith.constant 1 : i1
+  %val1 = arith.constant 123 : i16
+  %val2 = arith.constant 456 : i32
+
+  %res:4 = fir.iterate_while (%i = %lo to %up step %step) and (%c = %ok) iter_args(%v1 = %val1, %v2 = %val2) -> (index, i1, i16, i32) {
+    %new_c = arith.constant 1 : i1
+    %new_v1 = arith.constant 22 : i16
+    %new_v2 = arith.constant 33 : i32
+    fir.result %i, %new_c, %new_v1, %new_v2 : index, i1, i16, i32
+  }
+
+  return %res#0, %res#1, %res#2, %res#3 : index, i1, i16, i32
+}
+
+// CHECK-LABEL:   func.func @test_simple_iterate_while_2(
+// CHECK-SAME:        %[[ARG0:.*]]: index, %[[ARG1:.*]]: index, %[[ARG2:.*]]: i1, %[[ARG3:.*]]: i32) -> (index, i1, i32) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_1:.*]]:3 = scf.while (%[[VAL_2:.*]] = %[[ARG0]], %[[VAL_3:.*]] = %[[ARG2]], %[[VAL_4:.*]] = %[[ARG3]]) : (index, i1, i32) -> (index, i1, i32) {
+// CHECK:             %[[VAL_5:.*]] = arith.cmpi sle, %[[VAL_2]], %[[ARG1]] : index
+// CHECK:             %[[VAL_6:.*]] = arith.andi %[[VAL_5]], %[[VAL_3]] : i1
+// CHECK:             scf.condition(%[[VAL_6]]) %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] : index, i1, i32
+// CHECK:           } do {
+// CHECK:           ^bb0(%[[VAL_7:.*]]: index, %[[VAL_8:.*]]: i1, %[[VAL_9:.*]]: i32):
+// CHECK:             %[[VAL_10:.*]] = arith.addi %[[VAL_7]], %[[VAL_0]] : index
+// CHECK:             %[[VAL_11:.*]] = arith.constant 123 : i32
+// CHECK:             %[[VAL_12:.*]] = arith.constant true
+// CHECK:             scf.yield %[[VAL_10]], %[[VAL_12]], %[[VAL_11]] : index, i1, i32
+// CHECK:           }
+// CHECK:           return %[[VAL_13:.*]]#0, %[[VAL_13]]#1, %[[VAL_13]]#2 : index, i1, i32
+// CHECK:         }
+func.func @test_simple_iterate_while_2(%start: index, %stop: index, %cond: i1, %val: i32) -> (index, i1, i32) {
+  %step = arith.constant 1 : index
+
+  %res:3 = fir.iterate_while (%i = %start to %stop step %step) and (%ok = %cond) iter_args(%x = %val) -> (index, i1, i32) {
+    %new_x = arith.constant 123 : i32
+    %new_ok = arith.constant 1 : i1
+    fir.result %i, %new_ok, %new_x : index, i1, i32
+  }
+
+  return %res#0, %res#1, %res#2 : index, i1, i32
+}
+
+// CHECK-LABEL:   func.func @test_zero_iterations() -> (index, i1, i8) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 10 : index
+// CHECK:           %[[VAL_1:.*]] = arith.constant 5 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant true
+// CHECK:           %[[VAL_4:.*]] = arith.constant 42 : i8
+// CHECK:           %[[VAL_5:.*]]:3 = scf.while (%[[VAL_6:.*]] = %[[VAL_0]], %[[VAL_7:.*]] = %[[VAL_3]], %[[VAL_8:.*]] = %[[VAL_4]]) : (index, i1, i8) -> (index, i1, i8) {
+// CHECK:             %[[VAL_9:.*]] = arith.cmpi sle, %[[VAL_6]], %[[VAL_1]] : index
+// CHECK:             %[[VAL_10:.*]] = arith.andi %[[VAL_9]], %[[VAL_7]] : i1
+// CHECK:             scf.condition(%[[VAL_10]]) %[[VAL_6]], %[[VAL_7]], %[[VAL_8]] : index, i1, i8
+// CHECK:           } do {
+// CHECK:           ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: i1, %[[VAL_13:.*]]: i8):
+// CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_11]], %[[VAL_2]] : index
+// CHECK:             scf.yield %[[VAL_14]], %[[VAL_12]], %[[VAL_13]] : index, i1, i8
+// CHECK:           }
+// CHECK:           return %[[VAL_15:.*]]#0, %[[VAL_15]]#1, %[[VAL_15]]#2 : index, i1, i8
+// CHECK:         }
+func.func @test_zero_iterations() -> (index, i1, i8) {
+  %lo = arith.constant 10 : index
+  %up = arith.constant 5 : index
+  %step = arith.constant 1 : index
+  %ok = arith.constant 1 : i1
+  %x = arith.constant 42 : i8
+
+  %res:3 = fir.iterate_while (%i = %lo to %up step %step) and (%c = %ok) iter_args(%xv = %x) -> (index, i1, i8) {
+    fir.result %i, %c, %xv : index, i1, i8
+  }
+
+  return %res#0, %res#1, %res#2 : index, i1, i8
+}
diff --git a/flang/test/Integration/iso-fortran-binding.cpp b/flang/test/Integration/iso-fortran-binding.cpp
index aaafd7c..36ae35d 100644
--- a/flang/test/Integration/iso-fortran-binding.cpp
+++ b/flang/test/Integration/iso-fortran-binding.cpp
@@ -1,9 +1,9 @@
+// REQUIRES: clang
 // UNSUPPORTED: system-windows
-// RUN: split-file %s %t
-// RUN: chmod +x %t/runtest.sh
-// RUN: %t/runtest.sh %t %t/cppfile.cpp %flang | FileCheck %s
+// RUN: rm -rf %t && mkdir %t
+// RUN: %clangxx %isysroot -I%flang_include %s -o %t/a.out
+// RUN: %t/a.out | FileCheck %s
 
-//--- cppfile.cpp
 extern "C" {
 #include "ISO_Fortran_binding.h"
 }
@@ -15,19 +15,3 @@ int main() {
 }
 
 // CHECK: PASS
-// clang-format off
-//--- runtest.sh
-#!/bin/bash
-TMPDIR=$1
-CPPFILE=$2
-FLANG=$3
-BINDIR=`dirname $FLANG`
-CPPCOMP=$BINDIR/clang++
-if [ -x $CPPCOMP ]
-then
-  $CPPCOMP $CPPFILE -o $TMPDIR/a.out
-  $TMPDIR/a.out # should print "PASS"
-else
-  # No clang compiler, just pass by default
-  echo "PASS"
-fi
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 8f8bd9b..aef926b 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -474,3 +474,51 @@ end
 ! CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.box<!fir.array<?xf64>>
 ! CHECK: hlfir.assign %{{.*}} to %{{.*}} : f64, !fir.ref<f64>
 ! CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?xf64>>
+
+subroutine sub26(i, j, k)
+  integer :: i, j, k
+  real(2), dimension(i,j,k), device :: d
+  real(4), dimension(i,j,k) :: hd
+
+  hd = d
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub26
+! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<?x?x?xf16>, %{{.*}}, %{{.*}}, %{{.*}} : index, index, index {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub26Ed"} -> !fir.ref<!fir.array<?x?x?xf16>>
+! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub26Ed"} : (!fir.ref<!fir.array<?x?x?xf16>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf16>>, !fir.ref<!fir.array<?x?x?xf16>>)
+! CHECK: %[[HD:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub26Ehd"} : (!fir.ref<!fir.array<?x?x?xf32>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.ref<!fir.array<?x?x?xf32>>)
+! CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<?x?x?xf16>, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?x?xf16>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf16>>, !fir.heap<!fir.array<?x?x?xf16>>)
+! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.box<!fir.array<?x?x?xf16>>, !fir.box<!fir.array<?x?x?xf16>>
+! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<?x?x?xf32> {
+! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
+! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}})  : (!fir.box<!fir.array<?x?x?xf16>>, index, index, index) -> !fir.ref<f16>
+! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref<f16>
+! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32
+! CHECK: hlfir.yield_element %[[CONV]] : f32
+! CHECK: }
+! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<?x?x?xf32>, !fir.box<!fir.array<?x?x?xf32>> 
+
+subroutine sub27()
+  real(2), dimension(10, 20, 30), device :: d
+  real(4), dimension(10, 20, 30) :: hd
+
+  hd = d
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub27()
+! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<10x20x30xf16> {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub27Ed"} -> !fir.ref<!fir.array<10x20x30xf16>>
+! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub27Ed"} : (!fir.ref<!fir.array<10x20x30xf16>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x20x30xf16>>, !fir.ref<!fir.array<10x20x30xf16>>)
+! CHECK: %[[ALLOC_HD:.*]] = fir.alloca !fir.array<10x20x30xf32> {bindc_name = "hd", uniq_name = "_QFsub27Ehd"}
+! CHECK: %[[HD:.*]]:2 = hlfir.declare %[[ALLOC_HD]](%{{.*}}) {uniq_name = "_QFsub27Ehd"} : (!fir.ref<!fir.array<10x20x30xf32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x20x30xf32>>, !fir.ref<!fir.array<10x20x30xf32>>)
+! CHECK: %[[ALLOC_TEMP:.*]] = fir.allocmem !fir.array<10x20x30xf16> {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC_TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10x20x30xf16>>, !fir.shape<3>) -> (!fir.heap<!fir.array<10x20x30xf16>>, !fir.heap<!fir.array<10x20x30xf16>>)
+! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<10x20x30xf16>>, !fir.heap<!fir.array<10x20x30xf16>>
+! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<10x20x30xf32> {
+! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
+! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}})  : (!fir.heap<!fir.array<10x20x30xf16>>, index, index, index) -> !fir.ref<f16>
+! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref<f16>
+! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32
+! CHECK: hlfir.yield_element %[[CONV]] : f32
+! CHECK: }
+! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<10x20x30xf32>, !fir.ref<!fir.array<10x20x30xf32>>
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index ae33644..ab90dec 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -5,6 +5,7 @@
 attributes(global) subroutine devsub()
   implicit none
   integer :: ret
+  real(2) :: r2
   real(4) :: af
   real(8) :: ad
   integer(4) :: ai
@@ -58,6 +59,7 @@ attributes(global) subroutine devsub()
   res = __ffs(al)
   res = __brev(ai)
   resl = __brev(al)
+
   res = __clz(ai)
   res = __clz(al)  
   af = __cosf(af)
@@ -65,10 +67,33 @@ attributes(global) subroutine devsub()
   ad = __ddiv_rz(ad, ad)
   ad = __ddiv_ru(ad, ad)
   ad = __ddiv_rd(ad, ad)
+  af = __double2float_rn(ad)
+  af = __double2float_rz(ad)
+  af = __double2float_ru(ad)
+  af = __double2float_rd(ad)
+  ai = __double2int_rd(ad)
+  ai = __double2int_rn(ad)
+  ai = __double2int_ru(ad)
+  ai = __double2int_rz(ad)
+  ai = __double2uint_rd(ad)
+  ai = __double2uint_rn(ad)
+  ai = __double2uint_ru(ad)
+  ai = __double2uint_rz(ad)
   ai = __mul24(ai, ai)
   ai = __umul24(ai, ai)
   af = __powf(af, af)
-
+  ad = __dsqrt_rd(ad)
+  ad = __dsqrt_ru(ad)
+  ad = __ull2double_rd(al)
+  ad = __ull2double_rn(al)
+  ad = __ull2double_ru(al)
+  ad = __ull2double_rz(al)
+  r2 = __float2half_rn(af)
+  af = __half2float(r2)
+  ad = __ll2double_rd(al)
+  ad = __ll2double_rn(al)
+  ad = __ll2double_ru(al)
+  ad = __ll2double_rz(al)
 end
 
 ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
@@ -122,9 +147,33 @@ end
 ! CHECK: %{{.*}} = fir.call @__nv_ddiv_rz(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64, f64) -> f64
 ! CHECK: %{{.*}} = fir.call @__nv_ddiv_ru(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64, f64) -> f64
 ! CHECK: %{{.*}} = fir.call @__nv_ddiv_rd(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64, f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_double2float_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_double2float_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_double2float_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_double2float_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_double2int_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_double2int_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_double2int_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_double2int_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_double2uint_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_double2uint_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_double2uint_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_double2uint_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
 ! CHECK: %{{.*}} = fir.call @__nv_mul24(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32) -> i32
 ! CHECK: %{{.*}} = fir.call @__nv_umul24(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32) -> i32
 ! CHECK: %{{.*}} = fir.call @__nv_powf(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32, f32) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_dsqrt_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_dsqrt_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_ull2double_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_ull2double_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_ull2double_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_ull2double_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_float2half_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> f16
+! CHECK: %{{.*}} = fir.call @__nv_half2float(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f16) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_ll2double_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_ll2double_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_ll2double_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_ll2double_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
 
 subroutine host1()
   integer, device :: a(32)
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
new file mode 100644
index 0000000..10e4c27
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -0,0 +1,21 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran procedures available in cudadevice module
+
+attributes(global) subroutine test_sad()
+  integer :: res
+  integer :: i, j, k
+  res = __sad(i, j, k)
+end subroutine
+
+! CHECK-LABEL: _QPtest_sad
+! CHECK: %{{.*}} = fir.call @__nv_sad(%{{.*}}, %{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32, i32) -> i32
+
+attributes(global) subroutine test_usad()
+  integer :: res
+  integer :: i, j, k
+  res = __usad(i, j, k)
+end subroutine
+
+! CHECK-LABEL: _QPtest_usad
+! CHECK: %{{.*}} = fir.call @__nv_usad(%{{.*}}, %{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32, i32) -> i32
diff --git a/flang/test/Lower/OpenMP/Todo/assumed-rank-privatization.f90 b/flang/test/Lower/OpenMP/Todo/assumed-rank-privatization.f90
new file mode 100644
index 0000000..e57833a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/assumed-rank-privatization.f90
@@ -0,0 +1,9 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Privatization of assumed rank variable
+subroutine assumedPriv(a)
+  integer :: a(..)
+
+  !$omp parallel private(a)
+  !$omp end parallel
+end
diff --git a/flang/test/Lower/OpenMP/atomic-update-reassoc.f90 b/flang/test/Lower/OpenMP/atomic-update-reassoc.f90
new file mode 100644
index 0000000..96ebb56
--- /dev/null
+++ b/flang/test/Lower/OpenMP/atomic-update-reassoc.f90
@@ -0,0 +1,75 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s
+
+subroutine f00(x, y)
+  implicit none
+  integer :: x, y
+
+  !$omp atomic update
+  x = ((x + 1) + y) + 2
+end
+
+!CHECK-LABEL: func.func @_QPf00
+!CHECK: %[[X:[0-9]+]]:2 = hlfir.declare %arg0
+!CHECK: %[[Y:[0-9]+]]:2 = hlfir.declare %arg1
+!CHECK: %c1_i32 = arith.constant 1 : i32
+!CHECK: %[[LOAD_Y:[0-9]+]] = fir.load %[[Y]]#0 : !fir.ref<i32>
+!CHECK: %[[Y_1:[0-9]+]] = arith.addi %c1_i32, %[[LOAD_Y]] : i32
+!CHECK: %c2_i32 = arith.constant 2 : i32
+!CHECK: %[[Y_1_2:[0-9]+]] = arith.addi %[[Y_1]], %c2_i32 : i32
+!CHECK: omp.atomic.update memory_order(relaxed) %[[X]]#0 : !fir.ref<i32> {
+!CHECK: ^bb0(%[[ARG:arg[0-9]+]]: i32):
+!CHECK:   %[[ARG_P:[0-9]+]] = arith.addi %[[ARG]], %[[Y_1_2]] : i32
+!CHECK:   omp.yield(%[[ARG_P]] : i32)
+!CHECK: }
+
+
+subroutine f01(x, y)
+  implicit none
+  real :: x
+  integer :: y
+
+  !$omp atomic update
+  x = (int(x) + y) + 1
+end
+
+!CHECK-LABEL: func.func @_QPf01
+!CHECK: %[[X:[0-9]+]]:2 = hlfir.declare %arg0
+!CHECK: %[[Y:[0-9]+]]:2 = hlfir.declare %arg1
+!CHECK: %[[LOAD_Y:[0-9]+]] = fir.load %[[Y]]#0 : !fir.ref<i32>
+!CHECK: %c1_i32 = arith.constant 1 : i32
+!CHECK: %[[Y_1:[0-9]+]] = arith.addi %[[LOAD_Y]], %c1_i32 : i32
+!CHECK: omp.atomic.update memory_order(relaxed) %[[X]]#0 : !fir.ref<f32> {
+!CHECK: ^bb0(%[[ARG:arg[0-9]+]]: f32):
+!CHECK:   %[[ARG_I:[0-9]+]] = fir.convert %[[ARG]] : (f32) -> i32
+!CHECK:   %[[ARG_P:[0-9]+]] = arith.addi %[[ARG_I]], %[[Y_1]] : i32
+!CHECK:   %[[ARG_F:[0-9]+]] = fir.convert %[[ARG_P]] : (i32) -> f32
+!CHECK:   omp.yield(%[[ARG_F]] : f32)
+!CHECK: }
+
+
+subroutine f02(x, a, b, c)
+  implicit none
+  integer(kind=4) :: x
+  integer(kind=8) :: a, b, c
+
+  !$omp atomic update
+  x = ((b + a) + x) + c
+end
+
+!CHECK-LABEL: func.func @_QPf02
+!CHECK: %[[A:[0-9]+]]:2 = hlfir.declare %arg1
+!CHECK: %[[B:[0-9]+]]:2 = hlfir.declare %arg2
+!CHECK: %[[C:[0-9]+]]:2 = hlfir.declare %arg3
+!CHECK: %[[X:[0-9]+]]:2 = hlfir.declare %arg0
+!CHECK: %[[LOAD_B:[0-9]+]] = fir.load %[[B]]#0 : !fir.ref<i64>
+!CHECK: %[[LOAD_A:[0-9]+]] = fir.load %[[A]]#0 : !fir.ref<i64>
+!CHECK: %[[A_B:[0-9]+]] = arith.addi %[[LOAD_B]], %[[LOAD_A]] : i64
+!CHECK: %[[LOAD_C:[0-9]+]] = fir.load %[[C]]#0 : !fir.ref<i64>
+!CHECK: %[[A_B_C:[0-9]+]] = arith.addi %[[A_B]], %[[LOAD_C]] : i64
+!CHECK: omp.atomic.update memory_order(relaxed) %[[X]]#0 : !fir.ref<i32> {
+!CHECK: ^bb0(%[[ARG:arg[0-9]+]]: i32):
+!CHECK:   %[[ARG_8:[0-9]+]] = fir.convert %[[ARG]] : (i32) -> i64
+!CHECK:   %[[ARG_P:[0-9]+]] = arith.addi %[[ARG_8]], %[[A_B_C]] : i64
+!CHECK:   %[[ARG_4:[0-9]+]] = fir.convert %[[ARG_P]] : (i64) -> i32
+!CHECK:   omp.yield(%[[ARG_4]] : i32)
+!CHECK: }
diff --git a/flang/test/Semantics/OpenACC/acc-branch.f90 b/flang/test/Semantics/OpenACC/acc-branch.f90
index a2d7b58..0a1bdc3 100644
--- a/flang/test/Semantics/OpenACC/acc-branch.f90
+++ b/flang/test/Semantics/OpenACC/acc-branch.f90
@@ -13,7 +13,7 @@ subroutine openacc_clause_validity
   !$acc parallel
   !$acc loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     !ERROR: RETURN statement is not allowed in a PARALLEL construct
     return
   end do
@@ -21,21 +21,21 @@ subroutine openacc_clause_validity
 
   !$acc parallel loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     !ERROR: RETURN statement is not allowed in a PARALLEL LOOP construct
     return
   end do
 
   !$acc serial loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     !ERROR: RETURN statement is not allowed in a SERIAL LOOP construct
     return
   end do
 
   !$acc kernels loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     !ERROR: RETURN statement is not allowed in a KERNELS LOOP construct
     return
   end do
@@ -43,7 +43,7 @@ subroutine openacc_clause_validity
   !$acc parallel
   !$acc loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     if(i == N-1) THEN
       exit
     end if
@@ -81,7 +81,7 @@ subroutine openacc_clause_validity
         exit fortname
         !$acc loop
           do i = 1, N
-            a(i) = 3.14
+            a(i) = 3.14d0
             if(i == N-1) THEN
               !ERROR: EXIT to construct 'name1' outside of PARALLEL construct is not allowed
               exit name1
@@ -89,7 +89,7 @@ subroutine openacc_clause_validity
           end do
 
           loop2: do i = 1, N
-            a(i) = 3.33
+            a(i) = 3.33d0
             !ERROR: EXIT to construct 'thisblk' outside of PARALLEL construct is not allowed
             exit thisblk
           end do loop2
@@ -102,7 +102,7 @@ subroutine openacc_clause_validity
   !$acc parallel
   !$acc loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     ifname: if (i == 2) then
       ! This is allowed.
       exit ifname
@@ -113,7 +113,7 @@ subroutine openacc_clause_validity
   !$acc parallel
   !$acc loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     if(i == N-1) THEN
       stop 999 ! no error
     end if
@@ -122,7 +122,7 @@ subroutine openacc_clause_validity
 
   !$acc kernels
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     !ERROR: RETURN statement is not allowed in a KERNELS construct
     return
   end do
@@ -130,7 +130,7 @@ subroutine openacc_clause_validity
 
   !$acc kernels
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     if(i == N-1) THEN
       exit
     end if
@@ -139,7 +139,7 @@ subroutine openacc_clause_validity
 
   !$acc kernels
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     if(i == N-1) THEN
       stop 999 ! no error
     end if
@@ -148,7 +148,7 @@ subroutine openacc_clause_validity
 
   !$acc serial
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     !ERROR: RETURN statement is not allowed in a SERIAL construct
     return
   end do
@@ -156,7 +156,7 @@ subroutine openacc_clause_validity
 
   !$acc serial
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     if(i == N-1) THEN
       exit
     end if
@@ -168,7 +168,7 @@ subroutine openacc_clause_validity
   do i = 1, N
     ifname: if (.true.) then
       print *, "LGTM"
-    a(i) = 3.14
+    a(i) = 3.14d0
     if(i == N-1) THEN
         !ERROR: EXIT to construct 'name2' outside of SERIAL construct is not allowed
         exit name2
@@ -181,7 +181,7 @@ subroutine openacc_clause_validity
 
   !$acc serial
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
     if(i == N-1) THEN
       stop 999 ! no error
     end if
diff --git a/flang/test/Semantics/OpenACC/acc-init-validity.f90 b/flang/test/Semantics/OpenACC/acc-init-validity.f90
index 083a241..bede04d 100644
--- a/flang/test/Semantics/OpenACC/acc-init-validity.f90
+++ b/flang/test/Semantics/OpenACC/acc-init-validity.f90
@@ -44,7 +44,7 @@ program openacc_init_validity
   do i = 1, N
     !ERROR: Directive INIT may not be called within a compute region
     !$acc init
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -53,7 +53,7 @@ program openacc_init_validity
   do i = 1, N
     !ERROR: Directive INIT may not be called within a compute region
     !$acc init
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial
 
@@ -62,7 +62,7 @@ program openacc_init_validity
   do i = 1, N
     !ERROR: Directive INIT may not be called within a compute region
     !$acc init
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end kernels
 
@@ -70,21 +70,21 @@ program openacc_init_validity
   do i = 1, N
     !ERROR: Directive INIT may not be called within a compute region
     !$acc init
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc serial loop
   do i = 1, N
     !ERROR: Directive INIT may not be called within a compute region
     !$acc init
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop
   do i = 1, N
     !ERROR: Directive INIT may not be called within a compute region
     !$acc init
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !ERROR: At most one IF clause can appear on the INIT directive
diff --git a/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 b/flang/test/Semantics/OpenACC/acc-kernels-loop.f90
index cfe27e4..65c6293 100644
--- a/flang/test/Semantics/OpenACC/acc-kernels-loop.f90
+++ b/flang/test/Semantics/OpenACC/acc-kernels-loop.f90
@@ -31,75 +31,75 @@ program openacc_kernels_loop_validity
 
   !$acc kernels loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end kernels loop
 
   !$acc kernels loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end kernels loop
 
   !$acc kernels loop num_gangs(8)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop num_gangs(gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop num_gangs(8)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop num_workers(worker_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop num_workers(8)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop vector_length(vector_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop vector_length(128)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop num_gangs(gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
 
   !$acc kernels loop if(.TRUE.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop if(ifCondition)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !ERROR: Unmatched END SERIAL LOOP directive
   !$acc end serial loop
@@ -107,194 +107,194 @@ program openacc_kernels_loop_validity
   !ERROR: Clause IF is not allowed after clause DEVICE_TYPE on the KERNELS LOOP directive
   !$acc kernels loop device_type(*) if(.TRUE.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end kernels loop
 
   !$acc kernels loop async
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop async(1)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop async(async1)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop wait(wait1)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop wait(wait1, wait2)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop wait(wait1) wait(wait2)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop wait(1, 2) async(3)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop wait(queues: 1, 2) async(3)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop wait(devnum: 1: 1, 2) async(3)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop wait(devnum: 1: queues: 1, 2) async(3)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop num_gangs(8)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop num_workers(8)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop vector_length(128)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop if(.true.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop if(ifCondition)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !ERROR: At most one IF clause can appear on the KERNELS LOOP directive
   !$acc kernels loop if(.true.) if(ifCondition)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop self
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop self(.true.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop self(ifCondition)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop copy(aa) copyin(bb) copyout(cc)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop copy(aa, bb) copyout(zero: cc)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop present(aa, bb) create(cc)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop copyin(readonly: aa, bb) create(zero: cc)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop deviceptr(aa, bb) no_create(cc)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !ERROR: Argument `aa` on the ATTACH clause must be a variable or array with the POINTER or ALLOCATABLE attribute
   !$acc kernels loop attach(aa, dd, p)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop private(aa, bb, cc)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop default(none) private(N, a)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop default(none)
   !ERROR: The DEFAULT(NONE) clause requires that 'n' must be listed in a data-mapping clause
   do i = 1, N
     !ERROR: The DEFAULT(NONE) clause requires that 'a' must be listed in a data-mapping clause
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop default(present)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !ERROR: At most one DEFAULT clause can appear on the KERNELS LOOP directive
   !$acc kernels loop default(none) default(present)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop device_type(*)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop device_type(multicore)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop device_type(host, multicore)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop device_type(*) async wait num_gangs(8) num_workers(8) vector_length(128)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop device_type(*) async
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !ERROR: Clause IF is not allowed after clause DEVICE_TYPE on the KERNELS LOOP directive
   !$acc kernels loop device_type(*) if(.TRUE.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc parallel loop
diff --git a/flang/test/Semantics/OpenACC/acc-kernels.f90 b/flang/test/Semantics/OpenACC/acc-kernels.f90
index 44e532a..9c3adfb 100644
--- a/flang/test/Semantics/OpenACC/acc-kernels.f90
+++ b/flang/test/Semantics/OpenACC/acc-kernels.f90
@@ -177,14 +177,14 @@ program openacc_kernels_validity
 
   !$acc kernels device_type(*) async
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end kernels
 
   !ERROR: Clause IF is not allowed after clause DEVICE_TYPE on the KERNELS directive
   !$acc kernels device_type(*) if(.TRUE.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end kernels
 
diff --git a/flang/test/Semantics/OpenACC/acc-loop.f90 b/flang/test/Semantics/OpenACC/acc-loop.f90
index 9301cf8..77c427e 100644
--- a/flang/test/Semantics/OpenACC/acc-loop.f90
+++ b/flang/test/Semantics/OpenACC/acc-loop.f90
@@ -31,35 +31,35 @@ program openacc_loop_validity
   !$acc parallel
   !$acc loop tile(2)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel device_type(*) num_gangs(2)
   !$acc loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop seq
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop independent
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop auto
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -67,35 +67,35 @@ program openacc_loop_validity
   !ERROR: At most one VECTOR clause can appear on the LOOP directive or in group separated by the DEVICE_TYPE clause
   !$acc loop vector vector(128)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop vector
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop vector(10)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop vector(vector_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop vector(length: vector_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -103,35 +103,35 @@ program openacc_loop_validity
   !ERROR: At most one WORKER clause can appear on the LOOP directive or in group separated by the DEVICE_TYPE clause
   !$acc loop worker worker(10)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop worker
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop worker(10)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop worker(worker_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop worker(num: worker_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -139,58 +139,58 @@ program openacc_loop_validity
   !ERROR: At most one GANG clause can appear on the LOOP directive or in group separated by the DEVICE_TYPE clause
   !$acc loop gang gang(gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc loop gang device_type(default) gang(gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !ERROR: At most one GANG clause can appear on the PARALLEL LOOP directive or in group separated by the DEVICE_TYPE clause
   !$acc parallel loop gang gang(gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc parallel loop gang device_type(default) gang(gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc parallel
   !$acc loop gang(gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop gang(num: gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop gang(gang_size, static:*)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop gang(num: gang_size, static:*)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel
   !$acc loop gang(num: gang_size, static: gang_size)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -222,7 +222,7 @@ program openacc_loop_validity
   !$acc loop collapse(-1)
   do i = 1, N
     do j = 1, N
-      a(i) = 3.14 + j
+      a(i) = 3.14d0 + j
     end do
   end do
   !$acc end parallel
@@ -231,7 +231,7 @@ program openacc_loop_validity
   !ERROR: Clause PRIVATE is not allowed after clause DEVICE_TYPE on the LOOP directive
   !$acc loop device_type(*) private(i)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -239,7 +239,7 @@ program openacc_loop_validity
   !ERROR: Clause GANG is not allowed if clause SEQ appears on the LOOP directive
   !$acc loop gang seq
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -247,7 +247,7 @@ program openacc_loop_validity
   !ERROR: Clause WORKER is not allowed if clause SEQ appears on the LOOP directive
   !$acc loop worker seq
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -255,7 +255,7 @@ program openacc_loop_validity
   !ERROR: Clause VECTOR is not allowed if clause SEQ appears on the LOOP directive
   !$acc loop vector seq
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -355,7 +355,7 @@ program openacc_loop_validity
   !$acc parallel device_type(*) if(.TRUE.)
   !$acc loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -363,7 +363,7 @@ program openacc_loop_validity
   do i = 1, N
     !ERROR: Loop control is not present in the DO LOOP
     do
-      a(i) = 3.14
+      a(i) = 3.14d0
     end do
   end do
 
diff --git a/flang/test/Semantics/OpenACC/acc-parallel-loop-validity.f90 b/flang/test/Semantics/OpenACC/acc-parallel-loop-validity.f90
index 78e1a7a..96962bb 100644
--- a/flang/test/Semantics/OpenACC/acc-parallel-loop-validity.f90
+++ b/flang/test/Semantics/OpenACC/acc-parallel-loop-validity.f90
@@ -19,64 +19,64 @@ program openacc_parallel_loop_validity
 
   !$acc parallel loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc parallel loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel loop
 
   !$acc parallel loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
   !$acc parallel loop tile(2)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc parallel loop self
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !ERROR: SELF clause on the PARALLEL LOOP directive only accepts optional scalar logical expression
   !$acc parallel loop self(bb, cc(:,:))
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc parallel loop self(.true.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc parallel loop self(ifCondition)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc parallel loop tile(2, 2)
   do i = 1, N
     do j = 1, N
-      aa(i, j) = 3.14
+      aa(i, j) = 3.14d0
     end do
   end do
 
   !ERROR: Clause IF is not allowed after clause DEVICE_TYPE on the PARALLEL LOOP directive
   !$acc parallel loop device_type(*) if(.TRUE.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel loop
 
   !$acc kernels loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !ERROR: Unmatched END PARALLEL LOOP directive
   !$acc end parallel loop
diff --git a/flang/test/Semantics/OpenACC/acc-parallel.f90 b/flang/test/Semantics/OpenACC/acc-parallel.f90
index b9d989e..635c547 100644
--- a/flang/test/Semantics/OpenACC/acc-parallel.f90
+++ b/flang/test/Semantics/OpenACC/acc-parallel.f90
@@ -24,7 +24,7 @@ program openacc_parallel_validity
   !$acc parallel device_type(*) num_gangs(2)
   !$acc loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -149,7 +149,7 @@ program openacc_parallel_validity
   !$acc parallel device_type(*) if(.TRUE.)
   !$acc loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
diff --git a/flang/test/Semantics/OpenACC/acc-serial-loop.f90 b/flang/test/Semantics/OpenACC/acc-serial-loop.f90
index 5d2be7f..9f23a27 100644
--- a/flang/test/Semantics/OpenACC/acc-serial-loop.f90
+++ b/flang/test/Semantics/OpenACC/acc-serial-loop.f90
@@ -77,32 +77,32 @@ program openacc_serial_loop_validity
   !ERROR: Clause IF is not allowed after clause DEVICE_TYPE on the SERIAL LOOP directive
   !$acc serial loop device_type(*) if(.TRUE.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial loop
 
   !$acc serial loop if(ifCondition)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial loop
 
   !$acc serial loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !ERROR: Unmatched END PARALLEL LOOP directive
   !$acc end parallel loop
 
   !$acc serial loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial loop
 
   !$acc serial loop
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial
 
diff --git a/flang/test/Semantics/OpenACC/acc-serial.f90 b/flang/test/Semantics/OpenACC/acc-serial.f90
index f3b81c9..d50bdf9 100644
--- a/flang/test/Semantics/OpenACC/acc-serial.f90
+++ b/flang/test/Semantics/OpenACC/acc-serial.f90
@@ -39,7 +39,7 @@ program openacc_serial_validity
   do i = 1, N
     !ERROR: Directive SET may not be called within a compute region
     !$acc set default_async(i)
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial
 
@@ -162,14 +162,14 @@ program openacc_serial_validity
 
   !$acc serial device_type(*) async
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial
 
   !ERROR: Clause IF is not allowed after clause DEVICE_TYPE on the SERIAL directive
   !$acc serial device_type(*) if(.TRUE.)
   do i = 1, N
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial
 
diff --git a/flang/test/Semantics/OpenACC/acc-set-validity.f90 b/flang/test/Semantics/OpenACC/acc-set-validity.f90
index 74522b3..3d514e1 100644
--- a/flang/test/Semantics/OpenACC/acc-set-validity.f90
+++ b/flang/test/Semantics/OpenACC/acc-set-validity.f90
@@ -31,7 +31,7 @@ program openacc_clause_validity
   do i = 1, N
     !ERROR: Directive SET may not be called within a compute region
     !$acc set default_async(i)
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -40,7 +40,7 @@ program openacc_clause_validity
   do i = 1, N
     !ERROR: Directive SET may not be called within a compute region
     !$acc set default_async(i)
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial
 
@@ -49,7 +49,7 @@ program openacc_clause_validity
   do i = 1, N
     !ERROR: Directive SET may not be called within a compute region
     !$acc set default_async(i)
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end kernels
 
@@ -57,21 +57,21 @@ program openacc_clause_validity
   do i = 1, N
     !ERROR: Directive SET may not be called within a compute region
     !$acc set default_async(i)
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc serial loop
   do i = 1, N
     !ERROR: Directive SET may not be called within a compute region
     !$acc set default_async(i)
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop
   do i = 1, N
     !ERROR: Directive SET may not be called within a compute region
     !$acc set default_async(i)
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !ERROR: At least one of DEFAULT_ASYNC, DEVICE_NUM, DEVICE_TYPE clause must appear on the SET directive
diff --git a/flang/test/Semantics/OpenACC/acc-shutdown-validity.f90 b/flang/test/Semantics/OpenACC/acc-shutdown-validity.f90
index 163130d..fff630e 100644
--- a/flang/test/Semantics/OpenACC/acc-shutdown-validity.f90
+++ b/flang/test/Semantics/OpenACC/acc-shutdown-validity.f90
@@ -32,7 +32,7 @@ program openacc_shutdown_validity
   do i = 1, N
     !ERROR: Directive SHUTDOWN may not be called within a compute region
     !$acc shutdown
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end parallel
 
@@ -41,7 +41,7 @@ program openacc_shutdown_validity
   do i = 1, N
     !ERROR: Directive SHUTDOWN may not be called within a compute region
     !$acc shutdown
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end serial
 
@@ -50,7 +50,7 @@ program openacc_shutdown_validity
   do i = 1, N
     !ERROR: Directive SHUTDOWN may not be called within a compute region
     !$acc shutdown
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
   !$acc end kernels
 
@@ -58,21 +58,21 @@ program openacc_shutdown_validity
   do i = 1, N
     !ERROR: Directive SHUTDOWN may not be called within a compute region
     !$acc shutdown
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc serial loop
   do i = 1, N
     !ERROR: Directive SHUTDOWN may not be called within a compute region
     !$acc shutdown
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc kernels loop
   do i = 1, N
     !ERROR: Directive SHUTDOWN may not be called within a compute region
     !$acc shutdown
-    a(i) = 3.14
+    a(i) = 3.14d0
   end do
 
   !$acc shutdown
diff --git a/flang/test/Semantics/OpenMP/atomic-update-only.f90 b/flang/test/Semantics/OpenMP/atomic-update-only.f90
index 3c02792..8ae261c 100644
--- a/flang/test/Semantics/OpenMP/atomic-update-only.f90
+++ b/flang/test/Semantics/OpenMP/atomic-update-only.f90
@@ -28,11 +28,18 @@ end
 
 subroutine f03
   integer :: x, y
+  real :: xr, yr
 
+  !With integer type the reassociation should be able to bring the `x` to
+  !the top of the + operator. Expect no diagnostics.
   !$omp atomic update
-  !ERROR: The atomic variable x cannot be a proper subexpression of an argument (here: (x+y)) in the update operation
-  !ERROR: The atomic variable x should appear as an argument of the top-level + operator
   x = (x + y) + 1
+
+  !Real variables cannot be reassociated (unless fastmath options are present).
+  !$omp atomic update
+  !ERROR: The atomic variable xr cannot be a proper subexpression of an argument (here: (xr+yr)) in the update operation
+  !ERROR: The atomic variable xr should appear as an argument of the top-level + operator
+  xr = (xr + yr) + 1
 end
 
 subroutine f04
diff --git a/flang/test/Semantics/OpenMP/atomic04.f90 b/flang/test/Semantics/OpenMP/atomic04.f90
index 8f8af31..002e06b 100644
--- a/flang/test/Semantics/OpenMP/atomic04.f90
+++ b/flang/test/Semantics/OpenMP/atomic04.f90
@@ -205,9 +205,8 @@ subroutine more_invalid_atomic_update_stmts()
     !ERROR: The atomic variable a should appear as an argument of the top-level + operator
         a = a * b + c
 
+    !This is expected to work due to reassociation.
     !$omp atomic update
-    !ERROR: The atomic variable a cannot be a proper subexpression of an argument (here: a+b) in the update operation
-    !ERROR: The atomic variable a should appear as an argument of the top-level + operator
         a = a + b + c
 
     !$omp atomic
diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90
index e725e26..5f74978 100644
--- a/flang/test/Semantics/OpenMP/clause-validity01.f90
+++ b/flang/test/Semantics/OpenMP/clause-validity01.f90
@@ -21,8 +21,8 @@ use omp_lib
   integer(omp_allocator_handle_kind) :: xy_alloc
   xy_alloc = omp_init_allocator(xy_memspace, 1, xy_traits)
 
-  arrayA = 1.414
-  arrayB = 3.14
+  arrayA = 1.414d0
+  arrayB = 3.14d0
   N = 1024
 
 ! 2.5 parallel-clause -> if-clause |
diff --git a/flang/test/Semantics/OpenMP/combined-constructs.f90 b/flang/test/Semantics/OpenMP/combined-constructs.f90
index 2298d33..49da562 100644
--- a/flang/test/Semantics/OpenMP/combined-constructs.f90
+++ b/flang/test/Semantics/OpenMP/combined-constructs.f90
@@ -10,46 +10,46 @@ program main
   !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region.
   !$omp distribute simd
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end distribute simd
 
   !$omp target parallel device(0)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel
 
   !ERROR: At most one DEVICE clause can appear on the TARGET PARALLEL directive
   !$omp target parallel device(0) device(1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel
 
   !$omp target parallel defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel
 
   !ERROR: 'variable-category' modifier is required
   !$omp target parallel defaultmap(tofrom)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel
 
   !ERROR: At most one DEFAULTMAP clause can appear on the TARGET PARALLEL directive
   !$omp target parallel defaultmap(tofrom:scalar) defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel
 
   !$omp target parallel map(tofrom:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel
 
@@ -57,46 +57,46 @@ program main
   !ERROR: Non-THREADPRIVATE object 'a' in COPYIN clause
   !$omp target parallel copyin(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel
 
   !$omp target parallel do device(0)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel do
 
   !ERROR: At most one DEVICE clause can appear on the TARGET PARALLEL DO directive
   !$omp target parallel do device(0) device(1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel do
 
   !$omp target parallel do defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel do
 
   !ERROR: 'variable-category' modifier is required
   !$omp target parallel do defaultmap(tofrom)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel do
 
   !ERROR: At most one DEFAULTMAP clause can appear on the TARGET PARALLEL DO directive
   !$omp target parallel do defaultmap(tofrom:scalar) defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel do
 
   !$omp target parallel do map(tofrom:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel do
 
@@ -104,406 +104,406 @@ program main
   !ERROR: Non-THREADPRIVATE object 'a' in COPYIN clause
   !$omp target parallel do copyin(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target parallel do
 
   !$omp target teams map(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !$omp target teams device(0)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: At most one DEVICE clause can appear on the TARGET TEAMS directive
   !$omp target teams device(0) device(1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: SCHEDULE clause is not allowed on the TARGET TEAMS directive
   !$omp target teams schedule(static)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !$omp target teams defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: 'variable-category' modifier is required
   !$omp target teams defaultmap(tofrom)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: At most one DEFAULTMAP clause can appear on the TARGET TEAMS directive
   !$omp target teams defaultmap(tofrom:scalar) defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !$omp target teams num_teams(3) thread_limit(10) default(shared) private(i) shared(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: At most one NUM_TEAMS clause can appear on the TARGET TEAMS directive
   !$omp target teams num_teams(2) num_teams(3)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: The parameter of the NUM_TEAMS clause must be a positive integer expression
   !$omp target teams num_teams(-1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: At most one THREAD_LIMIT clause can appear on the TARGET TEAMS directive
   !$omp target teams thread_limit(2) thread_limit(3)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: The parameter of the THREAD_LIMIT clause must be a positive integer expression
   !$omp target teams thread_limit(-1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: At most one DEFAULT clause can appear on the TARGET TEAMS directive
   !$omp target teams default(shared) default(private)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !$omp target teams num_teams(2) defaultmap(tofrom:scalar)
   do i = 1, N
-      a(i) = 3.14
+      a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !$omp target teams map(tofrom:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
   !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET TEAMS directive
   !$omp target teams map(delete:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams
 
 
   !$omp target teams distribute map(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !$omp target teams distribute device(0)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: At most one DEVICE clause can appear on the TARGET TEAMS DISTRIBUTE directive
   !$omp target teams distribute device(0) device(1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !$omp target teams distribute defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: 'variable-category' modifier is required
   !$omp target teams distribute defaultmap(tofrom)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: At most one DEFAULTMAP clause can appear on the TARGET TEAMS DISTRIBUTE directive
   !$omp target teams distribute defaultmap(tofrom:scalar) defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !$omp target teams distribute num_teams(3) thread_limit(10) default(shared) private(i) shared(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: At most one NUM_TEAMS clause can appear on the TARGET TEAMS DISTRIBUTE directive
   !$omp target teams distribute num_teams(2) num_teams(3)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: The parameter of the NUM_TEAMS clause must be a positive integer expression
   !$omp target teams distribute num_teams(-1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: At most one THREAD_LIMIT clause can appear on the TARGET TEAMS DISTRIBUTE directive
   !$omp target teams distribute thread_limit(2) thread_limit(3)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: The parameter of the THREAD_LIMIT clause must be a positive integer expression
   !$omp target teams distribute thread_limit(-1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: At most one DEFAULT clause can appear on the TARGET TEAMS DISTRIBUTE directive
   !$omp target teams distribute default(shared) default(private)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !$omp target teams distribute num_teams(2) defaultmap(tofrom:scalar)
   do i = 1, N
-      a(i) = 3.14
+      a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !$omp target teams distribute map(tofrom:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE directive
   !$omp target teams distribute map(delete:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute
 
   !$omp target teams distribute parallel do device(0)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: At most one DEVICE clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO directive
   !$omp target teams distribute parallel do device(0) device(1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !$omp target teams distribute parallel do defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: 'variable-category' modifier is required
   !$omp target teams distribute parallel do defaultmap(tofrom)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: At most one DEFAULTMAP clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO directive
   !$omp target teams distribute parallel do defaultmap(tofrom:scalar) defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !$omp target teams distribute parallel do num_teams(3) thread_limit(10) default(shared) private(i) shared(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: At most one NUM_TEAMS clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO directive
   !$omp target teams distribute parallel do num_teams(2) num_teams(3)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: The parameter of the NUM_TEAMS clause must be a positive integer expression
   !$omp target teams distribute parallel do num_teams(-1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: At most one THREAD_LIMIT clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO directive
   !$omp target teams distribute parallel do thread_limit(2) thread_limit(3)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: The parameter of the THREAD_LIMIT clause must be a positive integer expression
   !$omp target teams distribute parallel do thread_limit(-1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: At most one DEFAULT clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO directive
   !$omp target teams distribute parallel do default(shared) default(private)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !$omp target teams distribute parallel do num_teams(2) defaultmap(tofrom:scalar)
   do i = 1, N
-      a(i) = 3.14
+      a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !$omp target teams distribute parallel do map(tofrom:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
   !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE PARALLEL DO directive
   !$omp target teams distribute parallel do map(delete:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do
 
 
   !$omp target teams distribute parallel do simd map(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !$omp target teams distribute parallel do simd device(0)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: At most one DEVICE clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD directive
   !$omp target teams distribute parallel do simd device(0) device(1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !$omp target teams distribute parallel do simd defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: 'variable-category' modifier is required
   !$omp target teams distribute parallel do simd defaultmap(tofrom)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: At most one DEFAULTMAP clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD directive
   !$omp target teams distribute parallel do simd defaultmap(tofrom:scalar) defaultmap(tofrom:scalar)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !$omp target teams distribute parallel do simd num_teams(3) thread_limit(10) default(shared) private(i) shared(a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: At most one NUM_TEAMS clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD directive
   !$omp target teams distribute parallel do simd num_teams(2) num_teams(3)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: The parameter of the NUM_TEAMS clause must be a positive integer expression
   !$omp target teams distribute parallel do simd num_teams(-1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: At most one THREAD_LIMIT clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD directive
   !$omp target teams distribute parallel do simd thread_limit(2) thread_limit(3)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: The parameter of the THREAD_LIMIT clause must be a positive integer expression
   !$omp target teams distribute parallel do simd thread_limit(-1)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: At most one DEFAULT clause can appear on the TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD directive
   !$omp target teams distribute parallel do simd default(shared) default(private)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !$omp target teams distribute parallel do simd num_teams(2) defaultmap(tofrom:scalar)
   do i = 1, N
-      a(i) = 3.14
+      a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !$omp target teams distribute parallel do simd map(tofrom:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
   !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD directive
   !$omp target teams distribute parallel do simd map(delete:a)
   do i = 1, N
-     a(i) = 3.14
+     a(i) = 3.14d0
   enddo
   !$omp end target teams distribute parallel do simd
 
diff --git a/flang/test/Semantics/OpenMP/device-constructs.f90 b/flang/test/Semantics/OpenMP/device-constructs.f90
index 431e0f8..a41c461 100644
--- a/flang/test/Semantics/OpenMP/device-constructs.f90
+++ b/flang/test/Semantics/OpenMP/device-constructs.f90
@@ -8,131 +8,131 @@ program main
   integer :: N
   type(c_ptr) :: cptr
 
-  arrayA = 1.414
-  arrayB = 3.14
+  arrayA = 1.414d0
+  arrayB = 3.14d0
   N = 256
 
   !$omp target map(arrayA)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !$omp target device(0)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !ERROR: At most one DEVICE clause can appear on the TARGET directive
   !$omp target device(0) device(1)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !ERROR: SCHEDULE clause is not allowed on the TARGET directive
   !$omp target schedule(static)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !$omp target defaultmap(tofrom:scalar)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !$omp target defaultmap(tofrom)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !ERROR: At most one DEFAULTMAP clause can appear on the TARGET directive
   !$omp target defaultmap(tofrom:scalar) defaultmap(tofrom:scalar)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !$omp target thread_limit(4)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !ERROR: At most one THREAD_LIMIT clause can appear on the TARGET directive
   !$omp target thread_limit(4) thread_limit(8)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !$omp teams num_teams(3) thread_limit(10) default(shared) private(i) shared(a)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end teams
 
   !ERROR: At most one NUM_TEAMS clause can appear on the TEAMS directive
   !$omp teams num_teams(2) num_teams(3)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end teams
 
   !ERROR: The parameter of the NUM_TEAMS clause must be a positive integer expression
   !$omp teams num_teams(-1)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end teams
 
   !ERROR: At most one THREAD_LIMIT clause can appear on the TEAMS directive
   !$omp teams thread_limit(2) thread_limit(3)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end teams
 
   !ERROR: The parameter of the THREAD_LIMIT clause must be a positive integer expression
   !$omp teams thread_limit(-1)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end teams
 
   !ERROR: At most one DEFAULT clause can appear on the TEAMS directive
   !$omp teams default(shared) default(private)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end teams
 
   !$omp target teams num_teams(2) defaultmap(tofrom:scalar)
   do i = 1, N
-      a = 3.14
+      a = 3.14d0
   enddo
   !$omp end target teams
 
   !$omp target map(tofrom:a)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET directive
   !$omp target map(delete:a)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target
 
   !$omp target data device(0) map(to:a)
   do i = 1, N
-    a = 3.14
+    a = 3.14d0
   enddo
   !$omp end target data
 
@@ -147,7 +147,7 @@ program main
   !ERROR: At least one of MAP, USE_DEVICE_ADDR, USE_DEVICE_PTR clause must appear on the TARGET DATA directive
   !$omp target data device(0)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end target data
 
@@ -183,7 +183,7 @@ program main
   !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region.
   !$omp distribute
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end distribute
   !$omp end target
@@ -192,7 +192,7 @@ program main
   !$omp teams
   !$omp distribute
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end distribute
   !$omp end teams
@@ -205,7 +205,7 @@ program main
   do i = 1, N
      do j = 1, N
         do k = 1, N
-           a = 3.14
+           a = 3.14d0
         enddo
      enddo
   enddo
@@ -219,7 +219,7 @@ program main
   do i = 1, N
      do j = 1, N
         do k = 1, N
-           a = 3.14
+           a = 3.14d0
         enddo
      enddo
   enddo
@@ -231,7 +231,7 @@ program main
   !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region.
   !$omp distribute dist_schedule(static, 2)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end distribute
   !$omp end target
@@ -240,7 +240,7 @@ program main
   !$omp teams
   !$omp distribute dist_schedule(static, 2)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end distribute
   !$omp end teams
@@ -251,7 +251,7 @@ program main
   !ERROR: At most one DIST_SCHEDULE clause can appear on the DISTRIBUTE directive
   !$omp distribute dist_schedule(static, 2) dist_schedule(static, 3)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end distribute
   !$omp end target
@@ -261,7 +261,7 @@ program main
   !ERROR: At most one DIST_SCHEDULE clause can appear on the DISTRIBUTE directive
   !$omp distribute dist_schedule(static, 2) dist_schedule(static, 3)
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end distribute
   !$omp end teams
diff --git a/flang/test/Semantics/OpenMP/nested-distribute.f90 b/flang/test/Semantics/OpenMP/nested-distribute.f90
index c212763..cb4aea3 100644
--- a/flang/test/Semantics/OpenMP/nested-distribute.f90
+++ b/flang/test/Semantics/OpenMP/nested-distribute.f90
@@ -6,15 +6,15 @@ program main
   real(8) :: arrayA(256), arrayB(256)
   integer :: N
 
-  arrayA = 1.414
-  arrayB = 3.14
+  arrayA = 1.414d0
+  arrayB = 3.14d0
   N = 256
 
   !$omp task
   !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region.
   !$omp distribute 
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end distribute
   !$omp end task
@@ -24,7 +24,7 @@ program main
       !ERROR: Only `DISTRIBUTE`, `PARALLEL`, or `LOOP` regions are allowed to be strictly nested inside `TEAMS` region.
       !$omp task
       do k = 1, N
-         a = 3.14
+         a = 3.14d0
       enddo
       !$omp end task
    enddo
@@ -34,7 +34,7 @@ program main
    do i = 1, N
       !$omp parallel
       do k = 1, N
-         a = 3.14
+         a = 3.14d0
       enddo
       !$omp end parallel
    enddo
@@ -44,7 +44,7 @@ program main
   !ERROR: `DISTRIBUTE` region has to be strictly nested inside `TEAMS` region.
   !$omp distribute 
   do i = 1, N
-     a = 3.14
+     a = 3.14d0
   enddo
   !$omp end distribute
   !$omp end parallel
diff --git a/flang/test/Semantics/OpenMP/reduction-assumed.f90 b/flang/test/Semantics/OpenMP/reduction-assumed.f90
new file mode 100644
index 0000000..0bc8cd31
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/reduction-assumed.f90
@@ -0,0 +1,53 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
+
+! Variables in built-in reductions must have types which are valid for the
+! initialization and combiner expressions in the spec. This implies assumed
+! rank and assumed size cannot be used.
+
+subroutine assumedRank1(a)
+  integer :: a(..)
+
+  ! ERROR: The type of 'a' is incompatible with the reduction operator.
+  !$omp parallel reduction(+:a)
+  !$omp end parallel
+end
+
+subroutine assumedRank2(a)
+  integer :: a(..)
+
+  ! ERROR: The type of 'a' is incompatible with the reduction operator.
+  !$omp parallel reduction(min:a)
+  !$omp end parallel
+end
+
+subroutine assumedRank3(a)
+  integer :: a(..)
+
+  ! ERROR: The type of 'a' is incompatible with the reduction operator.
+  !$omp parallel reduction(iand:a)
+  !$omp end parallel
+end
+
+subroutine assumedSize1(a)
+  integer :: a(*)
+
+  ! ERROR: Whole assumed-size array 'a' may not appear here without subscripts
+  !$omp parallel reduction(+:a)
+  !$omp end parallel
+end
+
+subroutine assumedSize2(a)
+  integer :: a(*)
+
+  ! ERROR: Whole assumed-size array 'a' may not appear here without subscripts
+  !$omp parallel reduction(max:a)
+  !$omp end parallel
+end
+
+subroutine assumedSize3(a)
+  integer :: a(*)
+
+  ! ERROR: Whole assumed-size array 'a' may not appear here without subscripts
+  !$omp parallel reduction(ior:a)
+  !$omp end parallel
+end
diff --git a/flang/test/Semantics/OpenMP/simd-only.f90 b/flang/test/Semantics/OpenMP/simd-only.f90
new file mode 100644
index 0000000..da42b10d
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/simd-only.f90
@@ -0,0 +1,416 @@
+! RUN: %flang_fc1 -fopenmp-simd -fdebug-dump-parse-tree %s 2>&1 | FileCheck %s
+
+! Test that non-SIMD OpenMPConstructs are removed on the parse tree level
+! when -fopenmp-simd is specified.
+! Tests the logic in lib/Semantics/rewrite-parse-tree.cpp
+
+! CHECK-LABEL: Name = 'test_simd'
+subroutine test_simd()
+  integer :: i
+
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp simd
+  do i = 1, 100
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_do_simd'
+subroutine test_do_simd()
+  integer :: i
+
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK: OmpLoopDirective -> llvm::omp::Directive = do simd
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp do simd
+  do i = 1, 100
+  end do
+end subroutine
+
+
+! CHECK-LABEL: Name = 'test_parallel_do_simd'
+subroutine test_parallel_do_simd()
+  integer :: i
+
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK: OmpLoopDirective -> llvm::omp::Directive = parallel do simd
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp parallel do simd
+  do i = 1, 100
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_simd_scan'
+subroutine test_simd_scan()
+  integer :: i
+  real :: sum
+
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd
+  !$omp simd reduction(inscan,+:sum)
+  do i = 1, N
+    sum = sum + a(i)
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+    ! CHECK: OmpDirectiveName -> llvm::omp::Directive = scan
+    !$omp scan inclusive(sum)
+    sum       = sum + a(i)
+  end do
+
+end subroutine
+
+! CHECK-LABEL: Name = 'test_simd_atomic'
+subroutine test_simd_atomic()
+  integer :: i, x
+
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK: OmpLoopDirective -> llvm::omp::Directive = simd
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp simd
+  do i = 1, 100
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=i'
+  !$omp atomic write
+  x = i
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_do'
+subroutine test_do()
+  integer :: i
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = do
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp parallel do
+  do i = 1, 100
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_do_nested'
+subroutine test_do_nested()
+  integer :: i
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp parallel do
+  do i = 1, 100
+    do j = 1, 100
+    end do
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_target'
+subroutine test_target()
+  integer :: i
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp target
+  do i = 1, 100
+  end do
+  !$omp end target
+end subroutine
+
+! CHECK-LABEL: Name = 'test_target_teams_distribute'
+subroutine test_target_teams_distribute()
+  integer :: i
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target teams distribute
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp target teams distribute
+  do i = 1, 100
+  end do
+  !$omp end target teams distribute
+end subroutine
+
+
+! CHECK-LABEL: Name = 'test_target_data'
+subroutine test_target_data()
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target data
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp target data map(to: A) map(tofrom: B)
+  do i = 1, 100
+  end do
+  !$omp end target data
+end subroutine
+
+! CHECK-LABEL: Name = 'test_loop'
+subroutine test_loop()
+  integer :: i
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = loop
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp loop bind(thread)
+  do i = 1, 100
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_unroll'
+subroutine test_unroll()
+  integer :: i
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = unroll
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp unroll
+  do i = 1, 100
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_do_ordered'
+subroutine test_do_ordered()
+  integer :: i, x
+  x = 0
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = do
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp do ordered
+  do i = 1, 100
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = ordered
+  !$omp ordered
+  x = x + 1
+  !$omp end ordered
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_cancel'
+subroutine test_cancel()
+  integer :: i, x
+  x = 0
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp parallel do
+  do i = 1, 100
+  if (i == 10) then
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPCancelConstruct -> OmpDirectiveSpecification
+    ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = cancel
+    !$omp cancel do
+  end if
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPCancellationPointConstruct -> OmpDirectiveSpecification
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = cancellation point
+  !$omp cancellation point do
+  end do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_scan'
+subroutine test_scan()
+  integer :: i, sum
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  !$omp parallel do reduction(inscan, +: sum)
+  do i = 1, n
+    sum = sum + i
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+    ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = scan
+    !$omp scan inclusive(sum)
+  end do
+  !$omp end parallel do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_target_map'
+subroutine test_target_map()
+  integer :: array(10)
+
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target
+  !$omp target map(tofrom: array(2:10))
+    array(2) = array(2) * 2
+  !$omp end target
+end subroutine
+
+! CHECK-LABEL: Name = 'test_sections'
+subroutine test_sections()
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPSectionsConstruct
+  !$omp sections
+  ! CHECK-NOT: OpenMPConstruct -> OpenMPSectionConstruct
+  !$omp section
+  ! CHECK-NOT: OpenMPConstruct -> OpenMPSectionConstruct
+  !$omp section
+  !$omp end sections
+end subroutine
+
+! CHECK-LABEL: Name = 'test_threadprivate_mod'
+module test_threadprivate_mod
+  implicit none
+  ! CHECK: DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt
+  ! CHECK: Name = 'x'
+  ! CHECK: Name = 'y'
+  integer :: x, y
+  ! CHECK: DeclarationConstruct -> SpecificationConstruct -> OtherSpecificationStmt -> CommonStmt
+  ! CHECK: Name = 'x'
+  ! CHECK: Name = 'y'
+  common /vars/ x, y
+  ! CHECK-NOT: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPThreadprivate
+  !$omp threadprivate(/vars/)
+end module
+
+! CHECK-LABEL: Name = 'test_atomic'
+subroutine test_atomic()
+  real :: z, x, y
+  !$omp parallel private(tid, z)
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=y'
+    !$omp atomic write
+      x = y
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'z=x'
+    !$omp atomic read
+      z = x
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+1._4'
+    !$omp atomic update
+      x = x + 1
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'z=x'
+    !$omp atomic read
+      z = x
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=x+y'
+    !$omp atomic capture
+      x   = x + y
+    !$omp end atomic
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: Name = 'test_task_single_taskwait'
+subroutine test_task_single_taskwait()
+  integer :: x
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel
+  !$omp parallel
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = single
+  !$omp single
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+    do i = 1, 5
+      ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+      ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = task
+      ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=i'
+      !$omp task
+      x = i
+      !$omp end task
+    end do
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+    ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = taskwait
+    !$omp taskwait
+  !$omp end single
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: Name = 'test_task_taskyield_flush_barrier'
+subroutine test_task_taskyield_flush_barrier()
+  integer :: x, i
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel
+  !$omp parallel
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+    ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = barrier
+    !$omp barrier
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+    ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = single
+    !$omp single
+      ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+      ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = task
+      !$omp task
+        ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+        ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = taskyield
+        !$omp taskyield
+        ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=i'
+        x = i
+        ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPFlushConstruct -> OmpDirectiveSpecification
+        !$omp flush
+      !$omp end task
+      ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+      ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = task
+      !$omp task
+        ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPFlushConstruct -> OmpDirectiveSpecification
+        !$omp flush
+      !$omp end task
+      ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+      ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = taskwait
+      !$omp taskwait
+    !$omp end single
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: Name = 'test_master_masked'
+subroutine test_master_masked()
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = parallel
+  !$omp parallel private(tid)
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+    ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = masked
+    !$omp masked
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=y'
+    x = y
+    !$omp end masked
+    ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+    ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = master
+    !$omp master
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'y=x'
+    y = x
+    !$omp end master
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: Name = 'test_critical'
+subroutine test_critical()
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = parallel do
+  !$omp parallel do private(i)
+  do i = 1, 4
+    !$omp critical(mylock)
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=y'
+    x = y
+    !$omp end critical(mylock)
+  end do
+  !$omp end parallel do
+end subroutine
+
+! CHECK-LABEL: Name = 'test_target_enter_exit_update_data'
+subroutine test_target_enter_exit_update_data()
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target enter data
+  !$omp target enter data map(to: A)
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+  ! CHECK-NOT: OmpLoopDirective -> llvm::omp::Directive = target teams distribute parallel do
+  !$omp target teams distribute parallel do
+  ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+  do i = 1, n
+    ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=y'
+    x = y
+  end do
+  !$omp end target teams distribute parallel do
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target update
+  !$omp target update from(A)
+  ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct -> OmpDirectiveSpecification
+  ! CHECK-NOT: OmpDirectiveName -> llvm::omp::Directive = target exit data
+  !$omp target exit data map(from: A)
+end subroutine
+
+! CHECK-LABEL: Name = 'test_declare_mapper'
+module test_declare_mapper
+  implicit none
+
+  type :: myvec_t
+    integer               :: len
+    real, allocatable     :: data(:)
+  end type myvec_t
+
+  ! CHECK-NOT: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareMapperConstruct
+  !$omp declare mapper(myvec_t :: v) map(v, v%data(1:v%len))
+end module
diff --git a/flang/test/Semantics/widening.f90 b/flang/test/Semantics/widening.f90
new file mode 100644
index 0000000..52090c1
--- /dev/null
+++ b/flang/test/Semantics/widening.f90
@@ -0,0 +1,48 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1 -Werror
+
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 1.00000001490116119384765625e-1_4 is inexact [-Wreal-constant-widening]
+real(8), parameter :: warning1 = 0.1
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 1.10000002384185791015625_4 is inexact [-Wreal-constant-widening]
+real(8) :: warning2 = 1.1
+real, parameter :: noWarning1 = 2.1
+real(8) :: noWarning2 = warning1
+real(8) :: noWarning3 = noWarning1
+real(8) :: noWarning4 = 3.125 ! exact
+real(8) :: noWarning5 = 4.1d0 ! explicit 'd'
+real(8) :: noWarning6 = 5.1_4 ! explicit suffix
+real(8) :: noWarning7 = real(6.1, 8) ! explicit conversion
+real(8) :: noWarning8 = real(7.1d0) ! explicit narrowing conversion
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 8.1000003814697265625_4 is inexact [-Wreal-constant-widening]
+real(8) :: warning3 = real(8.1) ! no-op conversion
+! WARNING: Default real literal in COMPLEX(8) context might need a kind suffix, as its rounded value (9.1000003814697265625_4,1.01000003814697265625e1_4) is inexact [-Wreal-constant-widening]
+complex(8), parameter :: warning4 = (9.1, 10.1)
+! WARNING: Default real literal in COMPLEX(8) context might need a kind suffix, as its rounded value (1.11000003814697265625e1_4,1.21000003814697265625e1_4) is inexact [-Wreal-constant-widening]
+complex(8) :: warning5 = (11.1, 12.1)
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value [REAL(4)::1.31000003814697265625e1_4] is inexact [-Wreal-constant-widening]
+real(8) :: warning6(1) = [ 13.1 ]
+real(8) warning7
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 1.41000003814697265625e1_4 is inexact [-Wreal-constant-widening]
+data warning7/14.1/
+type derived
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 1.51000003814697265625e1_4 is inexact [-Wreal-constant-widening]
+  real(8) :: warning8 = 15.1
+  real(8) :: noWarning9 = real(16.1, 8)
+  real :: noWarning10 = 17.1
+end type
+type(derived) dx
+real noWarning11
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 1.81000003814697265625e1_4 is inexact [-Wreal-constant-widening]
+warning7 = 18.1
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 1.91000003814697265625e1_4 is inexact [-Wreal-constant-widening]
+dx%warning8 = 19.1
+dx%noWarning10 = 20.1
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 2.11000003814697265625e1_4 is inexact [-Wreal-constant-widening]
+dx = derived(21.1)
+dx = derived(22.125)
+noWarning11 = 23.1
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 2.41000003814697265625e1_4 is inexact [-Wreal-constant-widening]
+print *, [real(8) :: 24.1]
+! WARNING: Default real literal in REAL(8) context might need a kind suffix, as its rounded value 2.51000003814697265625e1_4 is inexact [-Wreal-constant-widening]
+print *, [real(8) :: noWarning11, 25.1]
+print *, [real(8) :: noWarning1] ! ok
+end
diff --git a/flang/test/Transforms/OpenMP/simd-only.mlir b/flang/test/Transforms/OpenMP/simd-only.mlir
new file mode 100644
index 0000000..0025d10
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/simd-only.mlir
@@ -0,0 +1,196 @@
+// RUN: fir-opt --split-input-file --verify-diagnostics --omp-simd-only %s | FileCheck %s
+
+// Check that simd operations are preserved, while all other OpenMP operations are removed or rewritten.
+// Tests the logic in flang/lib/Optimizer/OpenMP/SimdOnly.cpp
+
+// CHECK: omp.private
+// CHECK-LABEL: func.func @simd
+omp.private {type = private} @_QFEi_private_i32 : i32
+func.func @simd(%arg0: i32, %arg1: !fir.ref<i32>, %arg2: !fir.ref<i32>) {
+  %c1_i32 = arith.constant 1 : i32
+  %c100000_i32 = arith.constant 100000 : i32
+  // CHECK: omp.simd private
+  omp.simd private(@_QFEi_private_i32 %arg2 -> %arg3 : !fir.ref<i32>) {
+    // CHECK: omp.loop_nest
+    omp.loop_nest (%arg4) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) {
+      // CHECK: fir.store
+      fir.store %arg0 to %arg1 : !fir.ref<i32>
+      // CHECK: omp.yield
+      omp.yield
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @simd_composite
+func.func @simd_composite(%arg0: i32, %arg1: !fir.ref<i32>) {
+  %c1_i32 = arith.constant 1 : i32
+  %c100000_i32 = arith.constant 100000 : i32
+  // CHECK-NOT: omp.parallel
+  omp.parallel {
+    // CHECK-NOT: omp.wsloop
+    omp.wsloop {
+      // CHECK: omp.simd
+      omp.simd {
+        // CHECK: omp.loop_nest
+        omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) {
+          // CHECK: fir.store
+          fir.store %arg0 to %arg1 : !fir.ref<i32>
+          // CHECK: omp.yield
+          omp.yield
+        }
+      // CHECK-NOT: {omp.composite}
+      } {omp.composite}
+    } {omp.composite}
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+// CHECK-NOT: omp.private
+// CHECK-LABEL: func.func @parallel
+omp.private {type = private} @_QFEi_private_i32 : i32
+func.func @parallel(%arg0: i32, %arg1: !fir.ref<i32>) {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = arith.constant 1 : i32
+  %c100000_i32 = arith.constant 100000 : i32
+  // CHECK-NOT: omp.parallel
+  omp.parallel private(@_QFEi_private_i32 %arg1 -> %arg3 : !fir.ref<i32>) {
+    // CHECK: fir.convert
+    %15 = fir.convert %c1_i32 : (i32) -> index
+    // CHECK: fir.convert
+    %16 = fir.convert %c100000_i32 : (i32) -> index
+    // CHECK: fir.do_loop
+    %18:2 = fir.do_loop %arg4 = %15 to %16 step %c1 iter_args(%arg2 = %arg0) -> (index, i32) {
+      // CHECK: fir.store
+      fir.store %arg0 to %arg1 : !fir.ref<i32>
+      fir.result %arg4, %arg2 : index, i32
+    }
+    // CHECK-NOT: omp.terminator
+    omp.terminator
+    }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @target_map(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @target_map(%arg5: i32, %arg6: !fir.ref<i32>) {
+  // CHECK-NOT: omp.map.info
+  %3 = omp.map.info var_ptr(%arg6 : !fir.ref<i32>, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref<i32>
+  // CHECK-NOT: omp.target
+  omp.target map_entries(%3 -> %arg0 : !fir.ref<i32>) {
+    // CHECK: arith.constant
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: fir.store %c1_i32 to %[[ARG_1]]
+    fir.store %c1_i32 to %arg0 : !fir.ref<i32>
+    // CHECK-NOT: omp.terminator
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @teams
+func.func @teams(%arg0: i32, %arg1: !fir.ref<i32>) {
+  // CHECK-NOT: omp.teams
+  omp.teams {
+    // CHECK: fir.store
+    fir.store %arg0 to %arg1 : !fir.ref<i32>
+    // CHECK-NOT: omp.terminator
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @distribute_simd
+func.func @distribute_simd(%arg0: i32, %arg1: !fir.ref<i32>) {
+  %c1_i32 = arith.constant 1 : i32
+  %c100000_i32 = arith.constant 100000 : i32
+  // CHECK-NOT: omp.distribute
+  omp.distribute {
+    // CHECK: omp.simd
+    omp.simd {
+      // CHECK: omp.loop_nest
+      omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) {
+        // CHECK: fir.store
+        fir.store %arg0 to %arg1 : !fir.ref<i32>
+        // CHECK: omp.yield
+        omp.yield
+      }
+    // CHECK-NOT: {omp.composite}
+    } {omp.composite}
+  // CHECK-NOT: {omp.composite}
+  } {omp.composite}
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @threadprivate(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @threadprivate(%arg0: i32, %arg1: !fir.ref<i32>) {
+  // CHECK-NOT: omp.threadprivate
+  %1 = omp.threadprivate %arg1 : !fir.ref<i32> -> !fir.ref<i32>
+  // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]]
+  fir.store %arg0 to %1 : !fir.ref<i32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @multi_block(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_3:.*]]: i1
+func.func @multi_block(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %6: i1) {
+  %false = arith.constant false
+  %c0_i32 = arith.constant 0 : i32
+  // CHECK-NOT: omp.parallel
+  omp.parallel {
+    // CHECK: cf.cond_br %[[ARG_3]], ^[[BB1:.*]], ^[[BB2:.*]]
+    cf.cond_br %6, ^bb1, ^bb2
+  // CHECK: ^[[BB1]]
+  ^bb1:  // pred: ^bb0
+    // CHECK: fir.call
+    fir.call @_FortranAStopStatement(%c0_i32, %false, %false) fastmath<contract> : (i32, i1, i1) -> ()
+    // CHECK-NOT: omp.terminator
+    omp.terminator
+  // CHECK: ^[[BB2]]
+  ^bb2:  // pred: ^bb0
+    // CHECK: fir.store
+    fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+    // CHECK-NOT: omp.terminator
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @map_info(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @map_info(%funcArg0: i32, %funcArg1: !fir.ref<i32>) {
+  %c1 = arith.constant 1 : index
+  // CHECK-NOT: omp.map.bounds
+  %1 = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c1 : index) extent(%c1 : index) stride(%c1 : index) start_idx(%c1 : index)
+  // CHECK-NOT: omp.map.info
+  %13 = omp.map.info var_ptr(%funcArg1 : !fir.ref<i32>, i32) map_clauses(to) capture(ByRef) bounds(%1) -> !fir.ref<i32>
+  // CHECK-NOT: omp.target
+  omp.target map_entries(%13 -> %arg3 : !fir.ref<i32>) {
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: fir.store %c1_i32 to %[[ARG_1]]
+    fir.store %c1_i32 to %arg3 : !fir.ref<i32>
+    // CHECK-NOT: omp.terminator
+    omp.terminator
+  }
+  // CHECK-NOT: omp.map.info
+  %18 = omp.map.info var_ptr(%funcArg1 : !fir.ref<i32>, i32) map_clauses(from) capture(ByRef) bounds(%1) -> !fir.ref<i32>
+  return
+}
diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py
index 7eb5767..9d81aa4 100644
--- a/flang/test/lit.cfg.py
+++ b/flang/test/lit.cfg.py
@@ -118,10 +118,11 @@ if config.flang_standalone_build:
                 "PATH", config.flang_llvm_tools_dir, append_path=True
             )
 
-# On MacOS, -isysroot is needed to build binaries.
+# On MacOS, some tests need -isysroot to build binaries.
 isysroot_flag = []
 if config.osx_sysroot:
     isysroot_flag = ["-isysroot", config.osx_sysroot]
+config.substitutions.append(("%isysroot", " ".join(isysroot_flag)))
 
 # Check for DEFAULT_SYSROOT, because when it is set -isysroot has no effect.
 if config.default_sysroot:
@@ -133,7 +134,6 @@ tools = [
     ToolSubst(
         "%flang",
         command=FindTool("flang"),
-        extra_args=isysroot_flag,
         unresolved="fatal",
     ),
     ToolSubst(
@@ -172,6 +172,11 @@ if config.flang_standalone_build:
 else:
     llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir)
 
+llvm_config.use_clang(required=False)
+
+# Clang may need the include path for ISO_fortran_binding.h.
+config.substitutions.append(("%flang_include", config.flang_headers_dir))
+
 # Enable libpgmath testing
 result = lit_config.params.get("LIBPGMATH")
 if result:
diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in
index ae514401..cc1f4fa 100644
--- a/flang/test/lit.site.cfg.py.in
+++ b/flang/test/lit.site.cfg.py.in
@@ -6,6 +6,7 @@ import lit.util
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
 config.llvm_shlib_dir = lit_config.substitute(path(r"@SHLIBDIR@"))
 config.llvm_plugin_ext = "@LLVM_PLUGIN_EXT@"
+config.host_triple = "@LLVM_HOST_TRIPLE@"
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.llvm_target_triple_env = "@LLVM_TARGET_TRIPLE_ENV@"
 config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
@@ -13,6 +14,7 @@ config.errc_messages = "@LLVM_LIT_ERRC_MESSAGES@"
 config.flang_obj_root = "@FLANG_BINARY_DIR@"
 config.flang_tools_dir = lit_config.substitute("@FLANG_TOOLS_DIR@")
 config.flang_intrinsic_modules_dir = "@FLANG_INTRINSIC_MODULES_DIR@"
+config.flang_headers_dir = "@HEADER_BINARY_DIR@"
 config.flang_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
 config.flang_test_triple = "@FLANG_TEST_TARGET_TRIPLE@"
 config.flang_examples = @LLVM_BUILD_EXAMPLES@
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index edfc878..82dff26 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -520,7 +520,9 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
 
     if (emitFIR && useHLFIR) {
       // lower HLFIR to FIR
-      fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP,
+      fir::EnableOpenMP enableOmp =
+          enableOpenMP ? fir::EnableOpenMP::Full : fir::EnableOpenMP::None;
+      fir::createHLFIRToFIRPassPipeline(pm, enableOmp,
                                         llvm::OptimizationLevel::O2);
       if (mlir::failed(pm.run(mlirModule))) {
         llvm::errs() << "FATAL: lowering from HLFIR to FIR failed";
diff --git a/libc/config/baremetal/aarch64/entrypoints.txt b/libc/config/baremetal/aarch64/entrypoints.txt
index 007d64d..00fd467 100644
--- a/libc/config/baremetal/aarch64/entrypoints.txt
+++ b/libc/config/baremetal/aarch64/entrypoints.txt
@@ -763,6 +763,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
@@ -792,6 +795,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     # math.h C++23 mixed bfloat16 and _Float128 entrypoints
     libc.src.math.bf16addf128
     libc.src.math.bf16divf128
+    libc.src.math.bf16fmaf128
     libc.src.math.bf16mulf128
     libc.src.math.bf16subf128
   )
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index 6c1f52a..9a7800c0 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -766,6 +766,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
@@ -795,6 +798,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     # math.h C++23 mixed bfloat16 and _Float128 entrypoints
     libc.src.math.bf16addf128
     libc.src.math.bf16divf128
+    libc.src.math.bf16fmaf128
     libc.src.math.bf16mulf128
     libc.src.math.bf16subf128
   )
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index d141276..34b400b 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -766,6 +766,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
@@ -795,6 +798,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     # math.h C++23 mixed bfloat16 and _Float128 entrypoints
     libc.src.math.bf16addf128
     libc.src.math.bf16divf128
+    libc.src.math.bf16fmaf128
     libc.src.math.bf16mulf128
     libc.src.math.bf16subf128
   )
diff --git a/libc/config/darwin/aarch64/entrypoints.txt b/libc/config/darwin/aarch64/entrypoints.txt
index 57c09f0..d45ec35 100644
--- a/libc/config/darwin/aarch64/entrypoints.txt
+++ b/libc/config/darwin/aarch64/entrypoints.txt
@@ -596,6 +596,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt
index 9b207fd..1e12e9e 100644
--- a/libc/config/darwin/x86_64/entrypoints.txt
+++ b/libc/config/darwin/x86_64/entrypoints.txt
@@ -239,6 +239,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
diff --git a/libc/config/gpu/amdgpu/entrypoints.txt b/libc/config/gpu/amdgpu/entrypoints.txt
index 8981190..4b6f333 100644
--- a/libc/config/gpu/amdgpu/entrypoints.txt
+++ b/libc/config/gpu/amdgpu/entrypoints.txt
@@ -622,6 +622,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
diff --git a/libc/config/gpu/nvptx/entrypoints.txt b/libc/config/gpu/nvptx/entrypoints.txt
index dc23742..d24cc74 100644
--- a/libc/config/gpu/nvptx/entrypoints.txt
+++ b/libc/config/gpu/nvptx/entrypoints.txt
@@ -623,6 +623,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 5f68752..e71dc2e 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -850,6 +850,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
@@ -879,6 +882,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     # math.h C++23 mixed bfloat16 and _Float128 entrypoints
     libc.src.math.bf16addf128
     libc.src.math.bf16divf128
+    libc.src.math.bf16fmaf128
     libc.src.math.bf16mulf128
     libc.src.math.bf16subf128
   )
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 47689a2..ec01030 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -466,6 +466,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index b9efadc..54ea983 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -869,6 +869,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
@@ -898,6 +901,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     # math.h C++23 mixed bfloat16 and _Float128 entrypoints
     libc.src.math.bf16addf128
     libc.src.math.bf16divf128
+    libc.src.math.bf16fmaf128
     libc.src.math.bf16mulf128
     libc.src.math.bf16subf128
   )
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 458cb59..1ee10e6 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -901,6 +901,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
@@ -930,6 +933,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     # math.h C++23 mixed bfloat16 and _Float128 entrypoints
     libc.src.math.bf16addf128
     libc.src.math.bf16divf128
+    libc.src.math.bf16fmaf128
     libc.src.math.bf16mulf128
     libc.src.math.bf16subf128
   )
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 00104a3..37a2ee2 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -312,6 +312,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
   libc.src.math.bf16div
   libc.src.math.bf16divf
   libc.src.math.bf16divl
+  libc.src.math.bf16fma
+  libc.src.math.bf16fmaf
+  libc.src.math.bf16fmal
   libc.src.math.bf16mul
   libc.src.math.bf16mulf
   libc.src.math.bf16mull
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 0c11640..a7edb08 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -34,6 +34,7 @@
 #include "math/cbrtf.h"
 #include "math/cos.h"
 #include "math/cosf.h"
+#include "math/cosf16.h"
 #include "math/erff.h"
 #include "math/exp.h"
 #include "math/exp10.h"
diff --git a/libc/shared/math/cosf16.h b/libc/shared/math/cosf16.h
new file mode 100644
index 0000000..8a19285
--- /dev/null
+++ b/libc/shared/math/cosf16.h
@@ -0,0 +1,28 @@
+//===-- Shared cosf16 function ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_COSF16_H
+#define LLVM_LIBC_SHARED_MATH_COSF16_H
+
+#include "shared/libc_common.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/math/cosf16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+// Re-export the header-only implementation as shared::cosf16.
+using math::cosf16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_COSF16_H
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 2cd0645..f4a8ee0 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -391,6 +391,23 @@ add_header_library(
 )
 
 add_header_library(
+  cosf16
+  HDRS
+    cosf16.h
+  DEPENDS
+    .sincosf16_utils
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.types
+)
+
+add_header_library(
   erff
   HDRS
     erff.h
@@ -699,3 +716,13 @@ add_header_library(
     libc.src.__support.FPUtil.polyeval
     libc.src.__support.common
 )
+
+add_header_library(
+  sincosf16_utils
+  HDRS
+    sincosf16_utils.h
+  DEPENDS
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.common
+)
diff --git a/libc/src/__support/math/cosf16.h b/libc/src/__support/math/cosf16.h
new file mode 100644
index 0000000..50c9a8f
--- /dev/null
+++ b/libc/src/__support/math/cosf16.h
@@ -0,0 +1,106 @@
+//===-- Implementation header for cosf16 ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_COSF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_COSF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "sincosf16_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+// float16 cosine, header-only so the entrypoint and shared/ can both use it.
+LIBC_INLINE static constexpr float16 cosf16(float16 x) {
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  constexpr size_t N_EXCEPTS = 4;
+  // Keyed by the bits of |x|; consulted by the lookup further down.
+  constexpr fputil::ExceptValues<float16, N_EXCEPTS> COSF16_EXCEPTS{{
+      // (input, RZ output, RU offset, RD offset, RN offset)
+      {0x2b7c, 0x3bfc, 1, 0, 1},
+      {0x4ac1, 0x38b5, 1, 0, 0},
+      {0x5c49, 0xb8c6, 0, 1, 0},
+      {0x7acc, 0xa474, 0, 1, 0},
+  }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  using namespace sincosf16_internal;
+  using FPBits = fputil::FPBits<float16>;
+  FPBits xbits(x);
+
+  uint16_t x_u = xbits.uintval();
+  uint16_t x_abs = x_u & 0x7fff; // sign bit cleared, i.e. the bits of |x|
+  float xf = x; // widened to float for sincosf16_eval below
+
+  // Range reduction:
+  // For |x| > pi/32, we perform range reduction as follows:
+  // Find k and y such that:
+  //   x = (k + y) * pi/32
+  //   k is an integer, |y| < 0.5
+  //
+  // This is done by performing:
+  //   k = round(x * 32/pi)
+  //   y = x * 32/pi - k
+  //
+  // Once k and y are computed, we then deduce the answer by the cosine of sum
+  // formula:
+  //   cos(x) = cos((k + y) * pi/32)
+  //          = cos(k * pi/32) * cos(y * pi/32) -
+  //            sin(k * pi/32) * sin(y * pi/32)
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  // Handle exceptional values: on a hit for |x|, return the looked-up result.
+  if (auto r = COSF16_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  // cos(+/-0) = 1
+  if (LIBC_UNLIKELY(x_abs == 0U))
+    return fputil::cast<float16>(1.0f);
+
+  // cos(+/-inf) = NaN, and cos(NaN) = NaN
+  if (xbits.is_inf_or_nan()) {
+    if (xbits.is_signaling_nan()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+    // Infinite input: EDOM and FE_INVALID, each via its *_if_required helper.
+    if (xbits.is_inf()) {
+      fputil::set_errno_if_required(EDOM);
+      fputil::raise_except_if_required(FE_INVALID);
+    }
+    // Infinity or quiet NaN: return the sum of x and a quiet NaN.
+    return x + FPBits::quiet_nan().get_val();
+  }
+
+  float sin_k = 0.0f, cos_k = 0.0f, sin_y = 0.0f, cosm1_y = 0.0f;
+  sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y);
+  // Since cosm1_y = cos_y - 1, we have:
+  //   cos(x) = cos_k * cos_y - sin_k * sin_y
+  //          = cos_k * (cos_y - 1 + 1) - sin_k * sin_y
+  //          = cos_k * cosm1_y - sin_k * sin_y + cos_k
+  return fputil::cast<float16>(fputil::multiply_add(
+      cos_k, cosm1_y, fputil::multiply_add(-sin_k, sin_y, cos_k)));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_COSF16_H
diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/__support/math/sincosf16_utils.h
index 05cab09d..74f21fd 100644
--- a/libc/src/math/generic/sincosf16_utils.h
+++ b/libc/src/__support/math/sincosf16_utils.h
@@ -16,6 +16,8 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
+namespace sincosf16_internal {
+
 // Lookup table for sin(k * pi / 32) with k = 0, ..., 63.
 // Table is generated with Sollya as follows:
 // > display = hexadecimmal;
@@ -66,7 +68,7 @@ LIBC_INLINE int32_t range_reduction_sincosf16(float x, float &y) {
   return static_cast<int32_t>(kd);
 }
 
-static LIBC_INLINE void sincosf16_poly_eval(int32_t k, float y, float &sin_k,
+LIBC_INLINE static void sincosf16_poly_eval(int32_t k, float y, float &sin_k,
                                             float &cos_k, float &sin_y,
                                             float &cosm1_y) {
 
@@ -107,6 +109,8 @@ LIBC_INLINE void sincospif16_eval(float xf, float &sin_k, float &cos_k,
   sincosf16_poly_eval(k, y, sin_k, cos_k, sin_y, cosm1_y);
 }
 
+} // namespace sincosf16_internal
+
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 2cf5ae5..8db5901 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -584,6 +584,11 @@ add_math_entrypoint_object(bf16divf)
 add_math_entrypoint_object(bf16divl)
 add_math_entrypoint_object(bf16divf128)
 
+add_math_entrypoint_object(bf16fma)
+add_math_entrypoint_object(bf16fmaf)
+add_math_entrypoint_object(bf16fmal)
+add_math_entrypoint_object(bf16fmaf128)
+
 add_math_entrypoint_object(bf16mul)
 add_math_entrypoint_object(bf16mulf)
 add_math_entrypoint_object(bf16mull)
diff --git a/libc/src/math/bf16fma.h b/libc/src/math/bf16fma.h
new file mode 100644
index 0000000..aa54956
--- /dev/null
+++ b/libc/src/math/bf16fma.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16fma -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16FMA_H
+#define LLVM_LIBC_SRC_MATH_BF16FMA_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Entrypoint declaration; defined in generic/bf16fma.cpp.
+bfloat16 bf16fma(double x, double y, double z);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16FMA_H
diff --git a/libc/src/math/bf16fmaf.h b/libc/src/math/bf16fmaf.h
new file mode 100644
index 0000000..e8582bd
--- /dev/null
+++ b/libc/src/math/bf16fmaf.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16fmaf ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16FMAF_H
+#define LLVM_LIBC_SRC_MATH_BF16FMAF_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Entrypoint declaration; defined in generic/bf16fmaf.cpp.
+bfloat16 bf16fmaf(float x, float y, float z);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16FMAF_H
diff --git a/libc/src/math/bf16fmaf128.h b/libc/src/math/bf16fmaf128.h
new file mode 100644
index 0000000..4215e54
--- /dev/null
+++ b/libc/src/math/bf16fmaf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16fmaf128 -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16FMAF128_H
+#define LLVM_LIBC_SRC_MATH_BF16FMAF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Entrypoint declaration; defined in generic/bf16fmaf128.cpp.
+bfloat16 bf16fmaf128(float128 x, float128 y, float128 z);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16FMAF128_H
diff --git a/libc/src/math/bf16fmal.h b/libc/src/math/bf16fmal.h
new file mode 100644
index 0000000..b92f17b
--- /dev/null
+++ b/libc/src/math/bf16fmal.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for bf16fmal ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_BF16FMAL_H
+#define LLVM_LIBC_SRC_MATH_BF16FMAL_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Entrypoint declaration; defined in generic/bf16fmal.cpp.
+bfloat16 bf16fmal(long double x, long double y, long double z);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_BF16FMAL_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 70e5bf6..e12bee4 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -278,16 +278,6 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.generic.add_sub
 )
 
-add_header_library(
-  sincosf16_utils
-  HDRS
-    sincosf16_utils.h
-  DEPENDS
-    libc.src.__support.FPUtil.polyeval
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.common
-)
-
 add_entrypoint_object(
   cos
   SRCS
@@ -315,16 +305,7 @@ add_entrypoint_object(
   HDRS
     ../cosf16.h
   DEPENDS
-    .sincosf16_utils
-    libc.hdr.errno_macros
-    libc.hdr.fenv_macros
-    libc.src.__support.FPUtil.cast
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.macros.optimization
-    libc.src.__support.macros.properties.types
+    libc.src.__support.math.cosf16
 )
 
 add_entrypoint_object(
@@ -349,7 +330,6 @@ add_entrypoint_object(
   HDRS
     ../cospif16.h
   DEPENDS
-    .sincosf16_utils
     libc.hdr.errno_macros
     libc.hdr.fenv_macros
     libc.src.__support.FPUtil.cast
@@ -357,6 +337,7 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.multiply_add
     libc.src.__support.macros.optimization
+    libc.src.__support.math.sincosf16_utils
 )
 
 add_entrypoint_object(
@@ -405,7 +386,6 @@ add_entrypoint_object(
   HDRS
     ../sinf16.h
   DEPENDS
-    .sincosf16_utils
     libc.hdr.errno_macros
     libc.hdr.fenv_macros
     libc.src.__support.FPUtil.cast
@@ -415,6 +395,7 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.multiply_add
     libc.src.__support.macros.optimization
     libc.src.__support.macros.properties.types
+    libc.src.__support.math.sincosf16_utils
   COMPILE_OPTIONS
     ${libc_opt_high_flag}
 )
@@ -482,7 +463,6 @@ add_entrypoint_object(
   HDRS
     ../sinpif16.h
   DEPENDS
-    .sincosf16_utils
     libc.hdr.errno_macros
     libc.hdr.fenv_macros
     libc.src.__support.FPUtil.cast
@@ -490,6 +470,7 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.multiply_add
     libc.src.__support.macros.optimization
+    libc.src.__support.math.sincosf16_utils
 )
 
 add_entrypoint_object(
@@ -538,7 +519,6 @@ add_entrypoint_object(
   HDRS
     ../tanf16.h
   DEPENDS
-    .sincosf16_utils
     libc.hdr.errno_macros
     libc.hdr.fenv_macros
     libc.src.__support.FPUtil.cast
@@ -548,6 +528,7 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.multiply_add
     libc.src.__support.macros.optimization
     libc.src.__support.macros.properties.types
+    libc.src.__support.math.sincosf16_utils
 )
 
 add_entrypoint_object(
@@ -572,7 +553,6 @@ add_entrypoint_object(
   HDRS
     ../tanpif16.h
   DEPENDS
-    .sincosf16_utils
     libc.hdr.errno_macros
     libc.hdr.fenv_macros
     libc.src.__support.FPUtil.cast
@@ -581,6 +561,7 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.except_value_utils
     libc.src.__support.FPUtil.multiply_add
     libc.src.__support.macros.optimization
+    libc.src.__support.math.sincosf16_utils
 )
 
 add_entrypoint_object(
@@ -5106,15 +5087,57 @@ add_entrypoint_object(
 )
 
 add_entrypoint_object(
-  bf16sub
+  bf16fma
   SRCS
-    bf16sub.cpp
+    bf16fma.cpp
   HDRS
-    ../bf16sub.h
+    ../bf16fma.h
   DEPENDS
     libc.src.__support.common
     libc.src.__support.FPUtil.bfloat16
-    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16fmaf
+  SRCS
+    bf16fmaf.cpp
+  HDRS
+    ../bf16fmaf.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16fmal
+  SRCS
+    bf16fmal.cpp
+  HDRS
+    ../bf16fmal.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.fma
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
+  bf16fmaf128
+  SRCS
+    bf16fmaf128.cpp
+  HDRS
+    ../bf16fmaf128.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.fma
     libc.src.__support.macros.config
     libc.src.__support.macros.properties.types
 )
@@ -5176,6 +5199,20 @@ add_entrypoint_object(
 )
 
 add_entrypoint_object(
+  bf16sub
+  SRCS
+    bf16sub.cpp
+  HDRS
+    ../bf16sub.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.FPUtil.bfloat16
+    libc.src.__support.FPUtil.generic.add_sub
+    libc.src.__support.macros.config
+    libc.src.__support.macros.properties.types
+)
+
+add_entrypoint_object(
   bf16subf
   SRCS
     bf16subf.cpp
diff --git a/libc/src/math/generic/bf16fma.cpp b/libc/src/math/generic/bf16fma.cpp
new file mode 100644
index 0000000..0f0fe86
--- /dev/null
+++ b/libc/src/math/generic/bf16fma.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16fma function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16fma.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Entrypoint body: passes x, y, z straight to fputil::fma<bfloat16>.
+LLVM_LIBC_FUNCTION(bfloat16, bf16fma, (double x, double y, double z)) {
+  return fputil::fma<bfloat16>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16fmaf.cpp b/libc/src/math/generic/bf16fmaf.cpp
new file mode 100644
index 0000000..739691c
--- /dev/null
+++ b/libc/src/math/generic/bf16fmaf.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of bf16fmaf function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16fmaf.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Entrypoint body: passes x, y, z straight to fputil::fma<bfloat16>.
+LLVM_LIBC_FUNCTION(bfloat16, bf16fmaf, (float x, float y, float z)) {
+  return fputil::fma<bfloat16>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16fmaf128.cpp b/libc/src/math/generic/bf16fmaf128.cpp
new file mode 100644
index 0000000..a29a0b0
--- /dev/null
+++ b/libc/src/math/generic/bf16fmaf128.cpp
@@ -0,0 +1,22 @@
+//===-- Implementation of bf16fmaf128 function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16fmaf128.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Entrypoint body: passes x, y, z straight to fputil::fma<bfloat16>.
+LLVM_LIBC_FUNCTION(bfloat16, bf16fmaf128,
+                   (float128 x, float128 y, float128 z)) {
+  return fputil::fma<bfloat16>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16fmal.cpp b/libc/src/math/generic/bf16fmal.cpp
new file mode 100644
index 0000000..f31ec69
--- /dev/null
+++ b/libc/src/math/generic/bf16fmal.cpp
@@ -0,0 +1,22 @@
+//===-- Implementation of bf16fmal function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/bf16fmal.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+// Entrypoint body: passes x, y, z straight to fputil::fma<bfloat16>.
+LLVM_LIBC_FUNCTION(bfloat16, bf16fmal,
+                   (long double x, long double y, long double z)) {
+  return fputil::fma<bfloat16>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/cosf16.cpp b/libc/src/math/generic/cosf16.cpp
index 99bb03e..031c3e1 100644
--- a/libc/src/math/generic/cosf16.cpp
+++ b/libc/src/math/generic/cosf16.cpp
@@ -7,87 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/cosf16.h"
-#include "hdr/errno_macros.h"
-#include "hdr/fenv_macros.h"
-#include "sincosf16_utils.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/macros/optimization.h"
+#include "src/__support/math/cosf16.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-constexpr size_t N_EXCEPTS = 4;
-
-constexpr fputil::ExceptValues<float16, N_EXCEPTS> COSF16_EXCEPTS{{
-    // (input, RZ output, RU offset, RD offset, RN offset)
-    {0x2b7c, 0x3bfc, 1, 0, 1},
-    {0x4ac1, 0x38b5, 1, 0, 0},
-    {0x5c49, 0xb8c6, 0, 1, 0},
-    {0x7acc, 0xa474, 0, 1, 0},
-}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float16, cosf16, (float16 x)) {
-  using FPBits = fputil::FPBits<float16>;
-  FPBits xbits(x);
-
-  uint16_t x_u = xbits.uintval();
-  uint16_t x_abs = x_u & 0x7fff;
-  float xf = x;
-
-  // Range reduction:
-  // For |x| > pi/32, we perform range reduction as follows:
-  // Find k and y such that:
-  //   x = (k + y) * pi/32
-  //   k is an integer, |y| < 0.5
-  //
-  // This is done by performing:
-  //   k = round(x * 32/pi)
-  //   y = x * 32/pi - k
-  //
-  // Once k and y are computed, we then deduce the answer by the cosine of sum
-  // formula:
-  //   cos(x) = cos((k + y) * pi/32)
-  //          = cos(k * pi/32) * cos(y * pi/32) -
-  //            sin(k * pi/32) * sin(y * pi/32)
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  // Handle exceptional values
-  if (auto r = COSF16_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))
-    return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-  // cos(+/-0) = 1
-  if (LIBC_UNLIKELY(x_abs == 0U))
-    return fputil::cast<float16>(1.0f);
-
-  // cos(+/-inf) = NaN, and cos(NaN) = NaN
-  if (xbits.is_inf_or_nan()) {
-    if (xbits.is_signaling_nan()) {
-      fputil::raise_except_if_required(FE_INVALID);
-      return FPBits::quiet_nan().get_val();
-    }
-
-    if (xbits.is_inf()) {
-      fputil::set_errno_if_required(EDOM);
-      fputil::raise_except_if_required(FE_INVALID);
-    }
-
-    return x + FPBits::quiet_nan().get_val();
-  }
-
-  float sin_k, cos_k, sin_y, cosm1_y;
-  sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y);
-  // Since, cosm1_y = cos_y - 1, therefore:
-  //   cos(x) = cos_k * cos_y - sin_k * sin_y
-  //          = cos_k * (cos_y - 1 + 1) - sin_k * sin_y
-  //          = cos_k * cosm1_y - sin_k * sin_y + cos_k
-  return fputil::cast<float16>(fputil::multiply_add(
-      cos_k, cosm1_y, fputil::multiply_add(-sin_k, sin_y, cos_k)));
-}
+LLVM_LIBC_FUNCTION(float16, cosf16, (float16 x)) { return math::cosf16(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/cospif16.cpp b/libc/src/math/generic/cospif16.cpp
index 9dc2592..c99285b 100644
--- a/libc/src/math/generic/cospif16.cpp
+++ b/libc/src/math/generic/cospif16.cpp
@@ -9,16 +9,17 @@
 #include "src/math/cospif16.h"
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
-#include "sincosf16_utils.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/cast.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/macros/optimization.h"
+#include "src/__support/math/sincosf16_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, cospif16, (float16 x)) {
+  using namespace sincosf16_internal;
   using FPBits = typename fputil::FPBits<float16>;
   FPBits xbits(x);
 
diff --git a/libc/src/math/generic/sinf16.cpp b/libc/src/math/generic/sinf16.cpp
index 28debbd..2b57920 100644
--- a/libc/src/math/generic/sinf16.cpp
+++ b/libc/src/math/generic/sinf16.cpp
@@ -9,13 +9,13 @@
 #include "src/math/sinf16.h"
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
-#include "sincosf16_utils.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/cast.h"
 #include "src/__support/FPUtil/except_value_utils.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/macros/optimization.h"
+#include "src/__support/math/sincosf16_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -32,6 +32,7 @@ constexpr fputil::ExceptValues<float16, N_EXCEPTS> SINF16_EXCEPTS{{
 #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 
 LLVM_LIBC_FUNCTION(float16, sinf16, (float16 x)) {
+  using namespace sincosf16_internal;
   using FPBits = fputil::FPBits<float16>;
   FPBits xbits(x);
 
diff --git a/libc/src/math/generic/sinpif16.cpp b/libc/src/math/generic/sinpif16.cpp
index 68af484..311e6f9 100644
--- a/libc/src/math/generic/sinpif16.cpp
+++ b/libc/src/math/generic/sinpif16.cpp
@@ -9,15 +9,16 @@
 #include "src/math/sinpif16.h"
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
-#include "sincosf16_utils.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/cast.h"
 #include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/math/sincosf16_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, sinpif16, (float16 x)) {
+  using namespace sincosf16_internal;
   using FPBits = typename fputil::FPBits<float16>;
   FPBits xbits(x);
 
diff --git a/libc/src/math/generic/tanf16.cpp b/libc/src/math/generic/tanf16.cpp
index 229f4a3..20323a8 100644
--- a/libc/src/math/generic/tanf16.cpp
+++ b/libc/src/math/generic/tanf16.cpp
@@ -9,13 +9,13 @@
 #include "src/math/tanf16.h"
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
-#include "sincosf16_utils.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/cast.h"
 #include "src/__support/FPUtil/except_value_utils.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/macros/optimization.h"
+#include "src/__support/math/sincosf16_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -37,6 +37,7 @@ constexpr fputil::ExceptValues<float16, N_EXCEPTS> TANF16_EXCEPTS{{
 #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 
 LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) {
+  using namespace sincosf16_internal;
   using FPBits = fputil::FPBits<float16>;
   FPBits xbits(x);
 
diff --git a/libc/src/math/generic/tanpif16.cpp b/libc/src/math/generic/tanpif16.cpp
index 792d405..b137b09 100644
--- a/libc/src/math/generic/tanpif16.cpp
+++ b/libc/src/math/generic/tanpif16.cpp
@@ -9,13 +9,13 @@
 #include "src/math/tanpif16.h"
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
-#include "sincosf16_utils.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/cast.h"
 #include "src/__support/FPUtil/except_value_utils.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/macros/optimization.h"
+#include "src/__support/math/sincosf16_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -39,6 +39,7 @@ constexpr fputil::ExceptValues<float16, N_EXCEPTS> TANPIF16_EXCEPTS{{
 #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 
 LLVM_LIBC_FUNCTION(float16, tanpif16, (float16 x)) {
+  using namespace sincosf16_internal;
   using FPBits = typename fputil::FPBits<float16>;
   FPBits xbits(x);
 
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index 052be9a..a8f17d3 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -30,6 +30,7 @@ add_fp_unittest(
     libc.src.__support.math.cbrtf
     libc.src.__support.math.cos
     libc.src.__support.math.cosf
+    libc.src.__support.math.cosf16
     libc.src.__support.math.erff
     libc.src.__support.math.exp
     libc.src.__support.math.exp10
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 26e6a1b..971e1b7 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -21,7 +21,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) {
   EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::asinhf16(0.0f16));
   EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::atanf16(0.0f16));
   EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::atanhf16(0.0f16));
-
+  EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::cosf16(0.0f16));
   EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp10f16(0.0f16));
 
   EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::expf16(0.0f16));
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 85dddfb..11bbf67 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -3085,6 +3085,70 @@ add_fp_unittest(
 )
 
 add_fp_unittest(
+  bf16fma_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16fma_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.bf16fma
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16fmaf_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16fmaf_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.bf16fmaf
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16fmal_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16fmal_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.bf16fmal
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16fmaf128_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    bf16fmaf128_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.bf16fmaf128
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
   bf16mul_test
   NEED_MPFR
   SUITE
diff --git a/libc/test/src/math/bf16fma_test.cpp b/libc/test/src/math/bf16fma_test.cpp
new file mode 100644
index 0000000..81c73a0c
--- /dev/null
+++ b/libc/test/src/math/bf16fma_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16fma ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16fma.h"
+
+LIST_NARROWING_FMA_TESTS(bfloat16, double, LIBC_NAMESPACE::bf16fma)
diff --git a/libc/test/src/math/bf16fmaf128_test.cpp b/libc/test/src/math/bf16fmaf128_test.cpp
new file mode 100644
index 0000000..dd8f473
--- /dev/null
+++ b/libc/test/src/math/bf16fmaf128_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16fmaf128 -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16fmaf128.h"
+
+LIST_NARROWING_FMA_TESTS(bfloat16, float128, LIBC_NAMESPACE::bf16fmaf128)
diff --git a/libc/test/src/math/bf16fmaf_test.cpp b/libc/test/src/math/bf16fmaf_test.cpp
new file mode 100644
index 0000000..04c6748
--- /dev/null
+++ b/libc/test/src/math/bf16fmaf_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16fmaf --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16fmaf.h"
+
+LIST_NARROWING_FMA_TESTS(bfloat16, float, LIBC_NAMESPACE::bf16fmaf)
diff --git a/libc/test/src/math/bf16fmal_test.cpp b/libc/test/src/math/bf16fmal_test.cpp
new file mode 100644
index 0000000..4c45e2c
--- /dev/null
+++ b/libc/test/src/math/bf16fmal_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16fmal --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16fmal.h"
+
+LIST_NARROWING_FMA_TESTS(bfloat16, long double, LIBC_NAMESPACE::bf16fmal)
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index c3df8b1..00881bd 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -5681,6 +5681,66 @@ add_fp_unittest(
 )
 
 add_fp_unittest(
+  bf16fma_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16fma_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16fma
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16fmaf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16fmaf_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16fmaf
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16fmal_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16fmal_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16fmal
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
+  bf16fmaf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    bf16fmaf128_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.math.bf16fmaf128
+    libc.src.__support.FPUtil.bfloat16
+)
+
+add_fp_unittest(
   bf16div_test
   SUITE
     libc-math-smoke-tests
diff --git a/libc/test/src/math/smoke/bf16fma_test.cpp b/libc/test/src/math/smoke/bf16fma_test.cpp
new file mode 100644
index 0000000..81c73a0c
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16fma_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16fma ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16fma.h"
+
+LIST_NARROWING_FMA_TESTS(bfloat16, double, LIBC_NAMESPACE::bf16fma)
diff --git a/libc/test/src/math/smoke/bf16fmaf128_test.cpp b/libc/test/src/math/smoke/bf16fmaf128_test.cpp
new file mode 100644
index 0000000..dd8f473
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16fmaf128_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16fmaf128 -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16fmaf128.h"
+
+LIST_NARROWING_FMA_TESTS(bfloat16, float128, LIBC_NAMESPACE::bf16fmaf128)
diff --git a/libc/test/src/math/smoke/bf16fmaf_test.cpp b/libc/test/src/math/smoke/bf16fmaf_test.cpp
new file mode 100644
index 0000000..04c6748
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16fmaf_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16fmaf --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16fmaf.h"
+
+LIST_NARROWING_FMA_TESTS(bfloat16, float, LIBC_NAMESPACE::bf16fmaf)
diff --git a/libc/test/src/math/smoke/bf16fmal_test.cpp b/libc/test/src/math/smoke/bf16fmal_test.cpp
new file mode 100644
index 0000000..4c45e2c
--- /dev/null
+++ b/libc/test/src/math/smoke/bf16fmal_test.cpp
@@ -0,0 +1,14 @@
+//===-- Unittests for bf16fmal --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/math/bf16fmal.h"
+
+LIST_NARROWING_FMA_TESTS(bfloat16, long double, LIBC_NAMESPACE::bf16fmal)
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 57e818c..3ab129a 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -486,6 +486,21 @@ explain_ternary_operation_one_output_error(Operation,
                                            float16, double, RoundingMode);
 #endif
 
+template void explain_ternary_operation_one_output_error(
+    Operation, const TernaryInput<float> &, bfloat16, double, RoundingMode);
+template void explain_ternary_operation_one_output_error(
+    Operation, const TernaryInput<double> &, bfloat16, double, RoundingMode);
+template void
+explain_ternary_operation_one_output_error(Operation,
+                                           const TernaryInput<long double> &,
+                                           bfloat16, double, RoundingMode);
+#if defined(LIBC_TYPES_HAS_FLOAT128) &&                                        \
+    defined(LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE)
+template void explain_ternary_operation_one_output_error(
+    Operation, const TernaryInput<float128> &, bfloat16, double, RoundingMode);
+#endif // defined(LIBC_TYPES_HAS_FLOAT128) &&
+       // defined(LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE)
+
 template <typename InputType, typename OutputType>
 bool compare_unary_operation_single_output(Operation op, InputType input,
                                            OutputType libc_result,
@@ -734,6 +749,27 @@ compare_ternary_operation_one_output(Operation,
                                      double, RoundingMode);
 #endif
 
+template bool compare_ternary_operation_one_output(Operation,
+                                                   const TernaryInput<float> &,
+                                                   bfloat16, double,
+                                                   RoundingMode);
+template bool compare_ternary_operation_one_output(Operation,
+                                                   const TernaryInput<double> &,
+                                                   bfloat16, double,
+                                                   RoundingMode);
+template bool
+compare_ternary_operation_one_output(Operation,
+                                     const TernaryInput<long double> &,
+                                     bfloat16, double, RoundingMode);
+
+#if defined(LIBC_TYPES_HAS_FLOAT128) &&                                        \
+    defined(LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE)
+template bool
+compare_ternary_operation_one_output(Operation, const TernaryInput<float128> &,
+                                     bfloat16, double, RoundingMode);
+#endif // defined(LIBC_TYPES_HAS_FLOAT128) &&
+       // defined(LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE)
+
 } // namespace internal
 
 template <typename T> bool round_to_long(T x, long &result) {
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index 5460664..082a86a 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -71,7 +71,7 @@
 "`LWG4079 <https://wg21.link/LWG4079>`__","Missing Preconditions in ``concat_view::iterator``\`s conversion constructor","2024-06 (St. Louis)","","",""
 "`LWG4082 <https://wg21.link/LWG4082>`__","``views::concat(r)`` is well-formed when ``r`` is an ``output_range``","2024-06 (St. Louis)","","",""
 "`LWG4083 <https://wg21.link/LWG4083>`__","``views::as_rvalue`` should reject non-input ranges","2024-06 (St. Louis)","","",""
-"`LWG4096 <https://wg21.link/LWG4096>`__","``views::iota(views::iota(0))`` should be rejected","2024-06 (St. Louis)","","",""
+"`LWG4096 <https://wg21.link/LWG4096>`__","``views::iota(views::iota(0))`` should be rejected","2024-06 (St. Louis)","|Complete|","22",""
 "`LWG4098 <https://wg21.link/LWG4098>`__","``views::adjacent<0>`` should reject non-forward ranges","2024-06 (St. Louis)","","",""
 "`LWG4105 <https://wg21.link/LWG4105>`__","``ranges::ends_with``\`s Returns misses difference casting","2024-06 (St. Louis)","","",""
 "`LWG4106 <https://wg21.link/LWG4106>`__","``basic_format_args`` should not be default-constructible","2024-06 (St. Louis)","|Complete|","19",""
diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h
index 4b84585..32ff340 100644
--- a/libcxx/include/__ranges/iota_view.h
+++ b/libcxx/include/__ranges/iota_view.h
@@ -30,6 +30,7 @@
 #include <__ranges/movable_box.h>
 #include <__ranges/view_interface.h>
 #include <__type_traits/conditional.h>
+#include <__type_traits/decay.h>
 #include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/make_unsigned.h>
 #include <__type_traits/type_identity.h>
@@ -374,10 +375,10 @@ namespace views {
 namespace __iota {
 struct __fn {
   template <class _Start>
+    requires(requires(_Start __s) { ranges::iota_view<decay_t<_Start>>(std::forward<_Start>(__s)); })
   _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Start&& __start) const
-      noexcept(noexcept(ranges::iota_view(std::forward<_Start>(__start))))
-          -> decltype(ranges::iota_view(std::forward<_Start>(__start))) {
-    return ranges::iota_view(std::forward<_Start>(__start));
+      noexcept(noexcept(ranges::iota_view<decay_t<_Start>>(std::forward<_Start>(__start)))) {
+    return ranges::iota_view<decay_t<_Start>>(std::forward<_Start>(__start));
   }
 
   template <class _Start, class _BoundSentinel>
diff --git a/libcxx/include/__tree b/libcxx/include/__tree
index 6dadd09..2d89250 100644
--- a/libcxx/include/__tree
+++ b/libcxx/include/__tree
@@ -47,6 +47,30 @@
 _LIBCPP_PUSH_MACROS
 #include <__undef_macros>
 
+_LIBCPP_DIAGNOSTIC_PUSH
+// GCC complains about the backslashes at the end, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121528
+_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wcomment")
+// __tree is a red-black-tree implementation used for the associative containers (i.e. (multi)map/set). It stores
+// - (1) a pointer to the node with the smallest (i.e. leftmost) element, namely __begin_node_
+// - (2) the number of nodes in the tree, namely __size_
+// - (3) a pointer to the root of the tree, namely __end_node_
+//
+// Storing (1) and (2) is required to allow for constant time lookups. A tree looks like this in memory:
+//
+//      __end_node_
+//           |
+//          root
+//         /    \
+//       l1       r1
+//      /  \     /  \
+//    ...  ... ...  ...
+//
+// All nodes except __end_node_ have a __left_ and __right_ pointer as well as a __parent_ pointer.
+// __end_node_ only contains a __left_ pointer, which points to the root of the tree.
+// This layout allows for iteration through the tree without a need for special handling of the end node. See
+// __tree_next_iter and __tree_prev_iter for more details.
+_LIBCPP_DIAGNOSTIC_POP
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Pointer>
@@ -167,6 +191,11 @@ _LIBCPP_HIDE_FROM_ABI _NodePtr __tree_next(_NodePtr __x) _NOEXCEPT {
   return __x->__parent_unsafe();
 }
 
+// __tree_next_iter and __tree_prev_iter implement iteration through the tree. The order is as follows:
+// left sub-tree -> node -> right sub-tree. When the right-most node of a sub-tree is reached, we walk up the tree until
+// we find a node where we were in the left sub-tree. We are _always_ in a left sub-tree, since the __end_node_ points
+// to the actual root of the tree through a __left_ pointer. Incrementing the end() pointer is UB, so we can assume that
+// never happens.
 template <class _EndNodePtr, class _NodePtr>
 inline _LIBCPP_HIDE_FROM_ABI _EndNodePtr __tree_next_iter(_NodePtr __x) _NOEXCEPT {
   _LIBCPP_ASSERT_INTERNAL(__x != nullptr, "node shouldn't be null");
diff --git a/libcxx/include/string_view b/libcxx/include/string_view
index f86b272..9a20bb6 100644
--- a/libcxx/include/string_view
+++ b/libcxx/include/string_view
@@ -320,7 +320,7 @@ public:
   _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view(const _CharT* __s, size_type __len) _NOEXCEPT
       _LIBCPP_DIAGNOSE_NULLPTR_IF(__len != 0 && __s == nullptr, " if len is not zero")
       : __data_(__s), __size_(__len) {
-#  if _LIBCPP_STD_VER >= 14
+#  if !defined(_LIBCPP_CXX03_LANG) && (!defined(_LIBCPP_COMPILER_GCC) || _LIBCPP_STD_VER >= 14)
     // Allocations must fit in `ptrdiff_t` for pointer arithmetic to work. If `__len` exceeds it, the input
     // range could not have been valid. Most likely the caller underflowed some arithmetic and inadvertently
     // passed in a negative length.
@@ -502,7 +502,6 @@ public:
   // find
   _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type
   find(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT {
-    _LIBCPP_ASSERT_NON_NULL(__s.size() == 0 || __s.data() != nullptr, "string_view::find(): received nullptr");
     return std::__str_find<value_type, size_type, traits_type, npos>(data(), size(), __s.data(), __pos, __s.size());
   }
 
@@ -527,7 +526,6 @@ public:
   // rfind
   _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type
   rfind(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT {
-    _LIBCPP_ASSERT_NON_NULL(__s.size() == 0 || __s.data() != nullptr, "string_view::find(): received nullptr");
     return std::__str_rfind<value_type, size_type, traits_type, npos>(data(), size(), __s.data(), __pos, __s.size());
   }
 
@@ -553,7 +551,6 @@ public:
   // find_first_of
   _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type
   find_first_of(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT {
-    _LIBCPP_ASSERT_NON_NULL(__s.size() == 0 || __s.data() != nullptr, "string_view::find_first_of(): received nullptr");
     return std::__str_find_first_of<value_type, size_type, traits_type, npos>(
         data(), size(), __s.data(), __pos, __s.size());
   }
@@ -580,7 +577,6 @@ public:
   // find_last_of
   _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type
   find_last_of(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT {
-    _LIBCPP_ASSERT_NON_NULL(__s.size() == 0 || __s.data() != nullptr, "string_view::find_last_of(): received nullptr");
     return std::__str_find_last_of<value_type, size_type, traits_type, npos>(
         data(), size(), __s.data(), __pos, __s.size());
   }
@@ -607,8 +603,6 @@ public:
   // find_first_not_of
   _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type
   find_first_not_of(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT {
-    _LIBCPP_ASSERT_NON_NULL(
-        __s.size() == 0 || __s.data() != nullptr, "string_view::find_first_not_of(): received nullptr");
     return std::__str_find_first_not_of<value_type, size_type, traits_type, npos>(
         data(), size(), __s.data(), __pos, __s.size());
   }
@@ -635,8 +629,6 @@ public:
   // find_last_not_of
   _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type
   find_last_not_of(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT {
-    _LIBCPP_ASSERT_NON_NULL(
-        __s.size() == 0 || __s.data() != nullptr, "string_view::find_last_not_of(): received nullptr");
     return std::__str_find_last_not_of<value_type, size_type, traits_type, npos>(
         data(), size(), __s.data(), __pos, __s.size());
   }
diff --git a/libcxx/test/libcxx/strings/string.view/assert.ctor.length.pass.cpp b/libcxx/test/libcxx/strings/string.view/assert.ctor.length.pass.cpp
index af8b393..e47b5f5 100644
--- a/libcxx/test/libcxx/strings/string.view/assert.ctor.length.pass.cpp
+++ b/libcxx/test/libcxx/strings/string.view/assert.ctor.length.pass.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: has-unix-headers
-// UNSUPPORTED: c++03, c++11
+// UNSUPPORTED: c++03
+// UNSUPPORTED: c++11 && gcc
 // REQUIRES: libcpp-hardening-mode={{extensive|debug}}
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
@@ -17,10 +18,17 @@
 #include <string_view>
 
 #include "check_assertion.h"
+#include "test_macros.h"
+
+// We're testing for assertions here, so let's not diagnose the misuses at compile time
+// FIXME: This should really be in ADDITIONAL_COMPILE_FLAGS, but that doesn't work due to a Clang bug
+TEST_CLANG_DIAGNOSTIC_IGNORED("-Wnonnull")
 
 int main(int, char**) {
   char c = 0;
   TEST_LIBCPP_ASSERT_FAILURE(
       std::string_view(&c, -1), "string_view::string_view(_CharT *, size_t): length does not fit in difference_type");
+  TEST_LIBCPP_ASSERT_FAILURE(
+      std::string_view(nullptr, 1), "string_view::string_view(_CharT *, size_t): received nullptr");
   return 0;
 }
diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp
index a6b268f..4f06b26 100644
--- a/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp
+++ b/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp
@@ -46,6 +46,9 @@ constexpr void testType(U u) {
 
 struct X {};
 
+template <typename IntT>
+concept CanDoubleWrap = requires(IntT i) { std::views::iota(std::views::iota(i)); };
+
 constexpr bool test() {
   testType<SomeInt>(SomeInt(10));
   testType<SomeInt>(IntComparableWith(SomeInt(10)));
@@ -68,6 +71,15 @@ constexpr bool test() {
   {
     static_assert(std::same_as<decltype(std::views::iota), decltype(std::ranges::views::iota)>);
   }
+  { // LWG4096: views::iota(views::iota(0)) should be rejected
+    static_assert(!CanDoubleWrap<int>);
+    static_assert(!CanDoubleWrap<SomeInt>);
+
+    static_assert(!std::is_invocable_v<decltype(std::views::iota), decltype(std::views::iota(0))>);
+    static_assert(!std::is_invocable_v<decltype(std::views::iota), decltype(std::views::iota(82))>);
+    static_assert(!std::is_invocable_v<decltype(std::views::iota), decltype(std::views::iota(SomeInt(0)))>);
+    static_assert(!std::is_invocable_v<decltype(std::views::iota), decltype(std::views::iota(SomeInt(94)))>);
+  }
 
   return true;
 }
diff --git a/libcxx/utils/ci/docker-compose.yml b/libcxx/utils/ci/docker-compose.yml
index ccaee8c..d8ba8e5 100644
--- a/libcxx/utils/ci/docker-compose.yml
+++ b/libcxx/utils/ci/docker-compose.yml
@@ -4,7 +4,7 @@ x-versions: &compiler_versions
 
 x-image-versions: &image_versions
   BASE_IMAGE: ubuntu:jammy
-  ACTIONS_BASE_IMAGE: builder-base
+  ACTIONS_BASE_IMAGE: ghcr.io/llvm/libcxx-linux-builder-base:77cb0980bcc2675b27d08141526939423fa0be76
 
 services:
   builder-base:
@@ -23,7 +23,7 @@ services:
       dockerfile: Dockerfile
       target: actions-builder
       args:
-        GITHUB_RUNNER_VERSION: "2.326.0"
+        GITHUB_RUNNER_VERSION: "2.328.0"
         <<: [*image_versions, *compiler_versions]
 
   android-buildkite-builder:
diff --git a/lldb/include/lldb/Protocol/MCP/Protocol.h b/lldb/include/lldb/Protocol/MCP/Protocol.h
index 141d064..49f9490 100644
--- a/lldb/include/lldb/Protocol/MCP/Protocol.h
+++ b/lldb/include/lldb/Protocol/MCP/Protocol.h
@@ -86,6 +86,10 @@ bool operator==(const Notification &, const Notification &);
 
 /// A general message as defined by the JSON-RPC 2.0 spec.
 using Message = std::variant<Request, Response, Notification>;
+// With clang-cl and MSVC STL 202208, convertible can be false later if we do
+// not force it to be checked early here.
+static_assert(std::is_convertible_v<Message, Message>,
+              "Message is not convertible to itself");
 
 bool fromJSON(const llvm::json::Value &, Message &, llvm::json::Path);
 llvm::json::Value toJSON(const Message &);
diff --git a/lldb/source/Core/Value.cpp b/lldb/source/Core/Value.cpp
index 028f058..86327e3 100644
--- a/lldb/source/Core/Value.cpp
+++ b/lldb/source/Core/Value.cpp
@@ -488,9 +488,11 @@ Status Value::GetValueAsData(ExecutionContext *exe_ctx, DataExtractor &data,
     address = m_value.ULongLong(LLDB_INVALID_ADDRESS);
     address_type = eAddressTypeHost;
     if (exe_ctx) {
-      Target *target = exe_ctx->GetTargetPtr();
-      if (target) {
-        data.SetByteOrder(target->GetArchitecture().GetByteOrder());
+      if (Target *target = exe_ctx->GetTargetPtr()) {
+        // Registers are always stored in host endian.
+        data.SetByteOrder(m_context_type == ContextType::RegisterInfo
+                              ? endian::InlHostByteOrder()
+                              : target->GetArchitecture().GetByteOrder());
         data.SetAddressByteSize(target->GetArchitecture().GetAddressByteSize());
         break;
       }
diff --git a/lldb/source/Plugins/Platform/Android/AdbClient.cpp b/lldb/source/Plugins/Platform/Android/AdbClient.cpp
index a179260c..0fbb48a 100644
--- a/lldb/source/Plugins/Platform/Android/AdbClient.cpp
+++ b/lldb/source/Plugins/Platform/Android/AdbClient.cpp
@@ -8,61 +8,48 @@
 
 #include "AdbClient.h"
 
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FileUtilities.h"
-
 #include "lldb/Host/ConnectionFileDescriptor.h"
 #include "lldb/Host/FileSystem.h"
-#include "lldb/Host/PosixApi.h"
-#include "lldb/Utility/DataBuffer.h"
-#include "lldb/Utility/DataBufferHeap.h"
+#include "lldb/Utility/Connection.h"
 #include "lldb/Utility/DataEncoder.h"
 #include "lldb/Utility/DataExtractor.h"
 #include "lldb/Utility/FileSpec.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/Utility/Status.h"
 #include "lldb/Utility/StreamString.h"
 #include "lldb/Utility/Timeout.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FileUtilities.h"
+#include <chrono>
 
 #include <climits>
-
-#include <algorithm>
 #include <cstdlib>
 #include <fstream>
 #include <sstream>
 
-// On Windows, transitive dependencies pull in <Windows.h>, which defines a
-// macro that clashes with a method name.
-#ifdef SendMessage
-#undef SendMessage
-#endif
-
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_private::platform_android;
 using namespace std::chrono;
+using namespace llvm;
 
-static const seconds kReadTimeout(20);
+static const char *kSocketNamespaceAbstract = "localabstract";
+static const char *kSocketNamespaceFileSystem = "localfilesystem";
+const seconds kReadTimeout(20);
 static const char *kOKAY = "OKAY";
 static const char *kFAIL = "FAIL";
 static const char *kDATA = "DATA";
 static const char *kDONE = "DONE";
-
 static const char *kSEND = "SEND";
 static const char *kRECV = "RECV";
 static const char *kSTAT = "STAT";
-
 static const size_t kSyncPacketLen = 8;
-// Maximum size of a filesync DATA packet.
 static const size_t kMaxPushData = 2 * 1024;
-// Default mode for pushed files.
-static const uint32_t kDefaultMode = 0100770; // S_IFREG | S_IRWXU | S_IRWXG
-
-static const char *kSocketNamespaceAbstract = "localabstract";
-static const char *kSocketNamespaceFileSystem = "localfilesystem";
+static const uint32_t kDefaultMode = 0100770;
 
 static Status ReadAllBytes(Connection &conn, void *buffer, size_t size) {
-
   Status error;
   ConnectionStatus status;
   char *read_buffer = static_cast<char *>(buffer);
@@ -85,86 +72,215 @@ static Status ReadAllBytes(Connection &conn, void *buffer, size_t size) {
     error = Status::FromErrorStringWithFormat(
         "Unable to read requested number of bytes. Connection status: %d.",
         status);
+
   return error;
 }
 
-Status AdbClient::CreateByDeviceID(const std::string &device_id,
-                                   AdbClient &adb) {
-  Status error;
-  std::string android_serial;
-  if (!device_id.empty())
-    android_serial = device_id;
-  else if (const char *env_serial = std::getenv("ANDROID_SERIAL"))
-    android_serial = env_serial;
+// Reads one length-prefixed message from `conn` into `message`.
+//
+// The framing handled here is four hexadecimal characters giving the payload
+// length, followed by that many payload bytes. `message` is cleared on entry
+// and is left empty whenever a failure Status is returned.
+static Status ReadAdbMessage(Connection &conn, std::vector<char> &message) {
+  message.clear();
 
-  if (android_serial.empty()) {
-    DeviceIDList connected_devices;
-    error = adb.GetDevices(connected_devices);
-    if (error.Fail())
-      return error;
+  // Four length characters plus a terminator so sscanf can parse the prefix.
+  char buffer[5];
+  buffer[4] = 0;
+
+  auto error = ReadAllBytes(conn, buffer, 4);
+  if (error.Fail())
+    return error;
+
+  // Reject a prefix that is not hexadecimal instead of silently treating it
+  // as a zero-length message.
+  unsigned int packet_len = 0;
+  if (sscanf(buffer, "%x", &packet_len) != 1)
+    return Status::FromErrorStringWithFormat(
+        "Invalid length prefix in adb message: \"%s\"", buffer);
+
+  // data() is well-defined for an empty vector, unlike &message[0], so a
+  // zero-length payload does not index out of bounds.
+  message.resize(packet_len, 0);
+  error = ReadAllBytes(conn, message.data(), packet_len);
+  if (error.Fail())
+    message.clear();
 
-    if (connected_devices.size() != 1)
-      return Status::FromErrorStringWithFormat(
-          "Expected a single connected device, got instead %zu - try "
-          "setting 'ANDROID_SERIAL'",
-          connected_devices.size());
-    adb.SetDeviceID(connected_devices.front());
-  } else {
-    adb.SetDeviceID(android_serial);
-  }
   return error;
 }
 
-AdbClient::AdbClient() = default;
-
-AdbClient::AdbClient(const std::string &device_id) : m_device_id(device_id) {}
+// Builds the failure Status for a response token that was not kOKAY.
+//
+// Any token other than kFAIL is reported as an unexpected response id. For
+// kFAIL, the error text that follows is read with ReadAdbMessage(), logged on
+// the Platform log channel and returned as the Status message.
+static Status GetResponseError(Connection &conn, const char *response_id) {
+  if (strcmp(response_id, kFAIL) != 0)
+    return Status::FromErrorStringWithFormat(
+        "Got unexpected response id from adb: \"%s\"", response_id);
 
-AdbClient::~AdbClient() = default;
+  std::vector<char> error_message;
+  auto error = ReadAdbMessage(conn, error_message);
+  if (error.Fail())
+    return error;
 
-void AdbClient::SetDeviceID(const std::string &device_id) {
-  m_device_id = device_id;
+  // Construct from the iterator range: the message may be empty, in which
+  // case taking &error_message[0] would index an empty vector.
+  std::string error_str(error_message.begin(), error_message.end());
+  Log *log = GetLog(LLDBLog::Platform);
+  LLDB_LOGF(log, "ADB error: %s", error_str.c_str());
+  return Status(error_str);
 }
 
-const std::string &AdbClient::GetDeviceID() const { return m_device_id; }
+// Reads the 4-byte response token from `conn` and turns it into a Status.
+//
+// Returns the read error if the token cannot be read, success when the token
+// is kOKAY, and otherwise whatever GetResponseError() reports for the token
+// that was received.
+static Status ReadResponseStatus(Connection &conn) {
+  // One extra byte so the token can be used as a NUL-terminated C string.
+  char response_id[5];
 
-Status AdbClient::Connect() {
+  const size_t packet_len = 4;
+  response_id[packet_len] = 0;
+
+  auto error = ReadAllBytes(conn, response_id, packet_len);
+  if (error.Fail())
+    return error;
+
+  if (strncmp(response_id, kOKAY, packet_len) != 0)
+    return GetResponseError(conn, response_id);
+
+  return error;
+}
+
+// Writes `packet` to `conn`, preceded by its length as four hex digits.
+//
+// Returns the first write error encountered, or an error without writing
+// anything when `packet` is too long for the 4-digit length prefix.
+static Status SendAdbMessage(Connection &conn, llvm::StringRef packet) {
   Status error;
-  m_conn = std::make_unique<ConnectionFileDescriptor>();
+
+  // The prefix is exactly four hex digits, so anything above 0xFFFF bytes
+  // cannot be framed; reject it rather than send a truncated length.
+  if (packet.size() > 0xFFFF)
+    return Status::FromErrorStringWithFormat(
+        "adb message too long: %zu bytes", packet.size());
+
+  char length_buffer[5];
+  snprintf(length_buffer, sizeof(length_buffer), "%04x",
+           static_cast<int>(packet.size()));
+
+  ConnectionStatus status;
+
+  conn.Write(length_buffer, 4, status, &error);
+  if (error.Fail())
+    return error;
+
+  // Write straight from the StringRef; the explicit size means no
+  // NUL-terminated copy is needed.
+  conn.Write(packet.data(), packet.size(), status, &error);
+  return error;
+}
+
+// Connects `conn` to the ADB server on 127.0.0.1 and returns the resulting
+// Status. The port is 5037 unless ANDROID_ADB_SERVER_PORT is set in the
+// environment, in which case that value is used verbatim.
+static Status ConnectToAdb(Connection &conn) {
   std::string port = "5037";
-  if (const char *env_port = std::getenv("ANDROID_ADB_SERVER_PORT")) {
+  if (const char *env_port = std::getenv("ANDROID_ADB_SERVER_PORT"))
     port = env_port;
-  }
   std::string uri = "connect://127.0.0.1:" + port;
-  m_conn->Connect(uri.c_str(), &error);
 
+  Log *log = GetLog(LLDBLog::Platform);
+  LLDB_LOGF(log, "Connecting to ADB server at %s", uri.c_str());
+
+  Status error;
+  conn.Connect(uri.c_str(), &error);
   return error;
 }
 
-Status AdbClient::GetDevices(DeviceIDList &device_list) {
-  device_list.clear();
-
-  auto error = SendMessage("host:devices");
+// Sends the "sync:" request on `conn` and returns the status of the reply.
+// A failure to send is returned immediately without reading a response.
+static Status EnterSyncMode(Connection &conn) {
+  auto error = SendAdbMessage(conn, "sync:");
   if (error.Fail())
     return error;
 
-  error = ReadResponseStatus();
+  return ReadResponseStatus(conn);
+}
+
+// Sends a "host:transport:<device_id>" request on `conn` and returns the
+// status of the reply. The selection is logged on the Platform channel.
+static Status SelectTargetDevice(Connection &conn, llvm::StringRef device_id) {
+  LLDB_LOG(GetLog(LLDBLog::Platform), "Selecting device: {0}", device_id);
+
+  const std::string request = "host:transport:" + device_id.str();
+
+  auto error = SendAdbMessage(conn, request);
   if (error.Fail())
     return error;
 
-  std::vector<char> in_buffer;
-  error = ReadMessage(in_buffer);
+  return ReadResponseStatus(conn);
+}
 
-  llvm::StringRef response(&in_buffer[0], in_buffer.size());
-  llvm::SmallVector<llvm::StringRef, 4> devices;
-  response.split(devices, "\n", -1, false);
+// Determines which device serial to use.
+//
+// Order of preference: a non-empty `device_id`, then the ANDROID_SERIAL
+// environment variable, then the single device reported by the ADB server for
+// a "host:devices" query. Returns an error if that query fails or if it does
+// not report exactly one device.
+Expected<std::string> AdbClient::ResolveDeviceID(StringRef device_id) {
+  StringRef preferred_serial;
+  if (!device_id.empty())
+    preferred_serial = device_id;
+  else if (const char *env_serial = std::getenv("ANDROID_SERIAL"))
+    preferred_serial = env_serial;
 
-  for (const auto &device : devices)
-    device_list.push_back(std::string(device.split('\t').first));
+  if (preferred_serial.empty()) {
+    DeviceIDList connected_devices;
 
-  // Force disconnect since ADB closes connection after host:devices response
-  // is sent.
-  m_conn.reset();
-  return error;
+    auto GetDevices = [](DeviceIDList &device_list) -> Status {
+      device_list.clear();
+
+      // Use a connection that lives only for this query: ADB closes the
+      // connection after the host:devices response, so it cannot be reused.
+      auto temp_conn = std::make_unique<ConnectionFileDescriptor>();
+      auto error = ConnectToAdb(*temp_conn);
+      if (error.Fail())
+        return error;
+
+      error = SendAdbMessage(*temp_conn, "host:devices");
+      if (error.Fail())
+        return error;
+
+      error = ReadResponseStatus(*temp_conn);
+      if (error.Fail())
+        return error;
+
+      std::vector<char> in_buffer;
+      error = ReadAdbMessage(*temp_conn, in_buffer);
+      // Do not parse a reply that could not be read.
+      if (error.Fail())
+        return error;
+
+      // data() is valid for an empty buffer (no devices attached), whereas
+      // &in_buffer[0] would index an empty vector.
+      StringRef response(in_buffer.data(), in_buffer.size());
+      SmallVector<StringRef, 4> devices;
+      response.split(devices, "\n", -1, false);
+
+      // Each line is "<serial>\t<state>"; only the serial is kept.
+      for (const auto &device : devices)
+        device_list.push_back(std::string(device.split('\t').first));
+      return error;
+    };
+
+    Status error = GetDevices(connected_devices);
+    if (error.Fail())
+      return error.ToError();
+
+    if (connected_devices.size() != 1)
+      return createStringError(
+          inconvertibleErrorCode(),
+          "Expected a single connected device, got instead %zu - try "
+          "setting 'ANDROID_SERIAL'",
+          connected_devices.size());
+
+    std::string resolved_device_id = std::move(connected_devices.front());
+    Log *log = GetLog(LLDBLog::Platform);
+    LLDB_LOGF(log, "AdbClient::ResolveDeviceID Resolved device ID: %s",
+              resolved_device_id.c_str());
+    return resolved_device_id;
+  }
+
+  std::string resolved_device_id = preferred_serial.str();
+  Log *log = GetLog(LLDBLog::Platform);
+  LLDB_LOGF(log, "AdbClient::ResolveDeviceID Resolved device ID: %s",
+            resolved_device_id.c_str());
+  return resolved_device_id;
+}
+
+// Creates a client bound to `device_id` and immediately tries to connect to
+// the ADB server. A constructor cannot return the Status, so a connection
+// failure is logged on the Platform channel instead of being dropped.
+AdbClient::AdbClient(llvm::StringRef device_id) : m_device_id(device_id) {
+  Log *log = GetLog(LLDBLog::Platform);
+  LLDB_LOGF(log,
+            "AdbClient::AdbClient(device_id='%s') - Creating AdbClient with "
+            "device ID",
+            device_id.str().c_str());
+  m_conn = std::make_unique<ConnectionFileDescriptor>();
+  Status error = Connect();
+  if (error.Fail())
+    LLDB_LOGF(log, "AdbClient::AdbClient - failed to connect to ADB: %s",
+              error.AsCString());
+}
+
+// Creates a client with no device ID and immediately tries to connect to the
+// ADB server. A constructor cannot return the Status, so a connection failure
+// is logged on the Platform channel instead of being dropped.
+AdbClient::AdbClient() {
+  Log *log = GetLog(LLDBLog::Platform);
+  LLDB_LOGF(
+      log,
+      "AdbClient::AdbClient() - Creating AdbClient with default constructor");
+  m_conn = std::make_unique<ConnectionFileDescriptor>();
+  Status error = Connect();
+  if (error.Fail())
+    LLDB_LOGF(log, "AdbClient::AdbClient - failed to connect to ADB: %s",
+              error.AsCString());
+}
+
+// Logs the destruction of the client together with its device ID; no other
+// explicit cleanup is performed here.
+AdbClient::~AdbClient() {
+  Log *log = GetLog(LLDBLog::Platform);
+  LLDB_LOGF(log,
+            "AdbClient::~AdbClient() - Destroying AdbClient for device: %s",
+            m_device_id.c_str());
+}
+
+// Returns a view of the stored device ID (m_device_id).
+llvm::StringRef AdbClient::GetDeviceID() const { return m_device_id; }
+
+// Connects m_conn to the ADB server via ConnectToAdb(). Returns a
+// default-constructed Status without reconnecting when m_conn already reports
+// that it is connected.
+Status AdbClient::Connect() {
+  if (m_conn->IsConnected())
+    return Status();
+
+  return ConnectToAdb(*m_conn);
 }
 
 Status AdbClient::SetPortForwarding(const uint16_t local_port,
@@ -177,7 +293,7 @@ Status AdbClient::SetPortForwarding(const uint16_t local_port,
   if (error.Fail())
     return error;
 
-  return ReadResponseStatus();
+  return ReadResponseStatus(*m_conn);
 }
 
 Status
@@ -196,7 +312,7 @@ AdbClient::SetPortForwarding(const uint16_t local_port,
   if (error.Fail())
     return error;
 
-  return ReadResponseStatus();
+  return ReadResponseStatus(*m_conn);
 }
 
 Status AdbClient::DeletePortForwarding(const uint16_t local_port) {
@@ -207,56 +323,13 @@ Status AdbClient::DeletePortForwarding(const uint16_t local_port) {
   if (error.Fail())
     return error;
 
-  return ReadResponseStatus();
-}
-
-Status AdbClient::SendMessage(const std::string &packet, const bool reconnect) {
-  Status error;
-  if (!m_conn || reconnect) {
-    error = Connect();
-    if (error.Fail())
-      return error;
-  }
-
-  char length_buffer[5];
-  snprintf(length_buffer, sizeof(length_buffer), "%04x",
-           static_cast<int>(packet.size()));
-
-  ConnectionStatus status;
-
-  m_conn->Write(length_buffer, 4, status, &error);
-  if (error.Fail())
-    return error;
-
-  m_conn->Write(packet.c_str(), packet.size(), status, &error);
-  return error;
+  return ReadResponseStatus(*m_conn);
 }
 
-Status AdbClient::SendDeviceMessage(const std::string &packet) {
+Status AdbClient::SendDeviceMessage(llvm::StringRef packet) {
   std::ostringstream msg;
-  msg << "host-serial:" << m_device_id << ":" << packet;
-  return SendMessage(msg.str());
-}
-
-Status AdbClient::ReadMessage(std::vector<char> &message) {
-  message.clear();
-
-  char buffer[5];
-  buffer[4] = 0;
-
-  auto error = ReadAllBytes(buffer, 4);
-  if (error.Fail())
-    return error;
-
-  unsigned int packet_len = 0;
-  sscanf(buffer, "%x", &packet_len);
-
-  message.resize(packet_len, 0);
-  error = ReadAllBytes(&message[0], packet_len);
-  if (error.Fail())
-    message.clear();
-
-  return error;
+  msg << "host-serial:" << m_device_id << ":" << packet.str();
+  return SendAdbMessage(*m_conn, msg.str());
 }
 
 Status AdbClient::ReadMessageStream(std::vector<char> &message,
@@ -264,6 +337,9 @@ Status AdbClient::ReadMessageStream(std::vector<char> &message,
   auto start = steady_clock::now();
   message.clear();
 
+  if (!m_conn)
+    return Status::FromErrorString("No connection available");
+
   Status error;
   lldb::ConnectionStatus status = lldb::eConnectionStatusSuccess;
   char buffer[1024];
@@ -282,87 +358,22 @@ Status AdbClient::ReadMessageStream(std::vector<char> &message,
   return error;
 }
 
-Status AdbClient::ReadResponseStatus() {
-  char response_id[5];
-
-  static const size_t packet_len = 4;
-  response_id[packet_len] = 0;
-
-  auto error = ReadAllBytes(response_id, packet_len);
-  if (error.Fail())
-    return error;
-
-  if (strncmp(response_id, kOKAY, packet_len) != 0)
-    return GetResponseError(response_id);
-
-  return error;
-}
-
-Status AdbClient::GetResponseError(const char *response_id) {
-  if (strcmp(response_id, kFAIL) != 0)
-    return Status::FromErrorStringWithFormat(
-        "Got unexpected response id from adb: \"%s\"", response_id);
-
-  std::vector<char> error_message;
-  auto error = ReadMessage(error_message);
-  if (!error.Success())
-    return error;
-  return Status(std::string(&error_message[0], error_message.size()));
-}
-
-Status AdbClient::SwitchDeviceTransport() {
-  std::ostringstream msg;
-  msg << "host:transport:" << m_device_id;
-
-  auto error = SendMessage(msg.str());
-  if (error.Fail())
-    return error;
-
-  return ReadResponseStatus();
-}
-
-Status AdbClient::StartSync() {
-  auto error = SwitchDeviceTransport();
-  if (error.Fail())
-    return Status::FromErrorStringWithFormat(
-        "Failed to switch to device transport: %s", error.AsCString());
-
-  error = Sync();
-  if (error.Fail())
-    return Status::FromErrorStringWithFormat("Sync failed: %s",
-                                             error.AsCString());
-
-  return error;
-}
-
-Status AdbClient::Sync() {
-  auto error = SendMessage("sync:", false);
-  if (error.Fail())
-    return error;
-
-  return ReadResponseStatus();
-}
-
-Status AdbClient::ReadAllBytes(void *buffer, size_t size) {
-  return ::ReadAllBytes(*m_conn, buffer, size);
-}
-
 Status AdbClient::internalShell(const char *command, milliseconds timeout,
                                 std::vector<char> &output_buf) {
   output_buf.clear();
 
-  auto error = SwitchDeviceTransport();
+  auto error = SelectTargetDevice(*m_conn, m_device_id);
   if (error.Fail())
     return Status::FromErrorStringWithFormat(
-        "Failed to switch to device transport: %s", error.AsCString());
+        "Failed to select target device: %s", error.AsCString());
 
   StreamString adb_command;
   adb_command.Printf("shell:%s", command);
-  error = SendMessage(std::string(adb_command.GetString()), false);
+  error = SendAdbMessage(*m_conn, std::string(adb_command.GetString()));
   if (error.Fail())
     return error;
 
-  error = ReadResponseStatus();
+  error = ReadResponseStatus(*m_conn);
   if (error.Fail())
     return error;
 
@@ -417,18 +428,8 @@ Status AdbClient::ShellToFile(const char *command, milliseconds timeout,
   return Status();
 }
 
-std::unique_ptr<AdbClient::SyncService>
-AdbClient::GetSyncService(Status &error) {
-  std::unique_ptr<SyncService> sync_service;
-  error = StartSync();
-  if (error.Success())
-    sync_service.reset(new SyncService(std::move(m_conn)));
-
-  return sync_service;
-}
-
-Status AdbClient::SyncService::internalPullFile(const FileSpec &remote_file,
-                                                const FileSpec &local_file) {
+Status AdbSyncService::PullFileImpl(const FileSpec &remote_file,
+                                    const FileSpec &local_file) {
   const auto local_file_path = local_file.GetPath();
   llvm::FileRemover local_file_remover(local_file_path);
 
@@ -462,8 +463,8 @@ Status AdbClient::SyncService::internalPullFile(const FileSpec &remote_file,
   return error;
 }
 
-Status AdbClient::SyncService::internalPushFile(const FileSpec &local_file,
-                                                const FileSpec &remote_file) {
+Status AdbSyncService::PushFileImpl(const FileSpec &local_file,
+                                    const FileSpec &remote_file) {
   const auto local_file_path(local_file.GetPath());
   std::ifstream src(local_file_path.c_str(), std::ios::in | std::ios::binary);
   if (!src.is_open())
@@ -487,7 +488,9 @@ Status AdbClient::SyncService::internalPushFile(const FileSpec &local_file,
                                                error.AsCString());
   }
   error = SendSyncRequest(
-      kDONE, llvm::sys::toTimeT(FileSystem::Instance().GetModificationTime(local_file)),
+      kDONE,
+      llvm::sys::toTimeT(
+          FileSystem::Instance().GetModificationTime(local_file)),
       nullptr);
   if (error.Fail())
     return error;
@@ -500,7 +503,7 @@ Status AdbClient::SyncService::internalPushFile(const FileSpec &local_file,
                                              error.AsCString());
   if (response_id == kFAIL) {
     std::string error_message(data_len, 0);
-    error = ReadAllBytes(&error_message[0], data_len);
+    error = ReadAllBytes(*m_conn, &error_message[0], data_len);
     if (error.Fail())
       return Status::FromErrorStringWithFormat(
           "Failed to read DONE error message: %s", error.AsCString());
@@ -518,9 +521,8 @@ Status AdbClient::SyncService::internalPushFile(const FileSpec &local_file,
   return error;
 }
 
-Status AdbClient::SyncService::internalStat(const FileSpec &remote_file,
-                                            uint32_t &mode, uint32_t &size,
-                                            uint32_t &mtime) {
+Status AdbSyncService::StatImpl(const FileSpec &remote_file, uint32_t &mode,
+                                uint32_t &size, uint32_t &mtime) {
   const std::string remote_file_path(remote_file.GetPath(false));
   auto error = SendSyncRequest(kSTAT, remote_file_path.length(),
                                remote_file_path.c_str());
@@ -532,7 +534,7 @@ Status AdbClient::SyncService::internalStat(const FileSpec &remote_file,
   static const size_t response_len = stat_len + (sizeof(uint32_t) * 3);
 
   std::vector<char> buffer(response_len);
-  error = ReadAllBytes(&buffer[0], buffer.size());
+  error = ReadAllBytes(*m_conn, &buffer[0], buffer.size());
   if (error.Fail())
     return Status::FromErrorStringWithFormat("Failed to read response: %s",
                                              error.AsCString());
@@ -555,51 +557,57 @@ Status AdbClient::SyncService::internalStat(const FileSpec &remote_file,
   return Status();
 }
 
-Status AdbClient::SyncService::PullFile(const FileSpec &remote_file,
-                                        const FileSpec &local_file) {
-  return executeCommand([this, &remote_file, &local_file]() {
-    return internalPullFile(remote_file, local_file);
+Status AdbSyncService::PullFile(const FileSpec &remote_file,
+                                const FileSpec &local_file) {
+  return ExecuteCommand([this, &remote_file, &local_file]() {
+    return PullFileImpl(remote_file, local_file);
   });
 }
 
-Status AdbClient::SyncService::PushFile(const FileSpec &local_file,
-                                        const FileSpec &remote_file) {
-  return executeCommand([this, &local_file, &remote_file]() {
-    return internalPushFile(local_file, remote_file);
+Status AdbSyncService::PushFile(const FileSpec &local_file,
+                                const FileSpec &remote_file) {
+  return ExecuteCommand([this, &local_file, &remote_file]() {
+    return PushFileImpl(local_file, remote_file);
   });
 }
 
-Status AdbClient::SyncService::Stat(const FileSpec &remote_file, uint32_t &mode,
-                                    uint32_t &size, uint32_t &mtime) {
-  return executeCommand([this, &remote_file, &mode, &size, &mtime]() {
-    return internalStat(remote_file, mode, size, mtime);
+Status AdbSyncService::Stat(const FileSpec &remote_file, uint32_t &mode,
+                            uint32_t &size, uint32_t &mtime) {
+  return ExecuteCommand([this, &remote_file, &mode, &size, &mtime]() {
+    return StatImpl(remote_file, mode, size, mtime);
   });
 }
 
-bool AdbClient::SyncService::IsConnected() const {
+bool AdbSyncService::IsConnected() const {
   return m_conn && m_conn->IsConnected();
 }
 
-AdbClient::SyncService::SyncService(std::unique_ptr<Connection> &&conn)
-    : m_conn(std::move(conn)) {}
-
-Status
-AdbClient::SyncService::executeCommand(const std::function<Status()> &cmd) {
-  if (!m_conn)
-    return Status::FromErrorString("SyncService is disconnected");
+AdbSyncService::AdbSyncService(const std::string device_id)
+    : m_device_id(device_id) {
+  // Only allocates the connection; SetupSyncConnection() does the connecting.
+  m_conn = std::make_unique<ConnectionFileDescriptor>();
+  LLDB_LOGF(GetLog(LLDBLog::Platform),
+            "AdbSyncService::AdbSyncService() - Creating AdbSyncService for "
+            "device: %s",
+            m_device_id.c_str());
+}
 
+Status AdbSyncService::ExecuteCommand(const std::function<Status()> &cmd) {
   Status error = cmd();
-  if (error.Fail())
-    m_conn.reset();
-
   return error;
 }
 
-AdbClient::SyncService::~SyncService() = default;
+AdbSyncService::~AdbSyncService() {
+  // Logging only; the owned connection is released with the m_conn member.
+  LLDB_LOGF(GetLog(LLDBLog::Platform),
+            "AdbSyncService::~AdbSyncService() - Destroying AdbSyncService for "
+            "device: %s",
+            m_device_id.c_str());
+}
 
-Status AdbClient::SyncService::SendSyncRequest(const char *request_id,
-                                               const uint32_t data_len,
-                                               const void *data) {
+Status AdbSyncService::SendSyncRequest(const char *request_id,
+                                       const uint32_t data_len,
+                                       const void *data) {
   DataEncoder encoder(eByteOrderLittle, sizeof(void *));
   encoder.AppendData(llvm::StringRef(request_id));
   encoder.AppendU32(data_len);
@@ -615,11 +623,11 @@ Status AdbClient::SyncService::SendSyncRequest(const char *request_id,
   return error;
 }
 
-Status AdbClient::SyncService::ReadSyncHeader(std::string &response_id,
-                                              uint32_t &data_len) {
+Status AdbSyncService::ReadSyncHeader(std::string &response_id,
+                                      uint32_t &data_len) {
   char buffer[kSyncPacketLen];
 
-  auto error = ReadAllBytes(buffer, kSyncPacketLen);
+  auto error = ReadAllBytes(*m_conn, buffer, kSyncPacketLen);
   if (error.Success()) {
     response_id.assign(&buffer[0], 4);
     DataExtractor extractor(&buffer[4], 4, eByteOrderLittle, sizeof(void *));
@@ -630,8 +638,7 @@ Status AdbClient::SyncService::ReadSyncHeader(std::string &response_id,
   return error;
 }
 
-Status AdbClient::SyncService::PullFileChunk(std::vector<char> &buffer,
-                                             bool &eof) {
+Status AdbSyncService::PullFileChunk(std::vector<char> &buffer, bool &eof) {
   buffer.clear();
 
   std::string response_id;
@@ -642,14 +649,14 @@ Status AdbClient::SyncService::PullFileChunk(std::vector<char> &buffer,
 
   if (response_id == kDATA) {
     buffer.resize(data_len, 0);
-    error = ReadAllBytes(&buffer[0], data_len);
+    error = ReadAllBytes(*m_conn, &buffer[0], data_len);
     if (error.Fail())
       buffer.clear();
   } else if (response_id == kDONE) {
     eof = true;
   } else if (response_id == kFAIL) {
     std::string error_message(data_len, 0);
-    error = ReadAllBytes(&error_message[0], data_len);
+    error = ReadAllBytes(*m_conn, &error_message[0], data_len);
     if (error.Fail())
       return Status::FromErrorStringWithFormat(
           "Failed to read pull error message: %s", error.AsCString());
@@ -662,6 +669,15 @@ Status AdbClient::SyncService::PullFileChunk(std::vector<char> &buffer,
   return Status();
 }
 
-Status AdbClient::SyncService::ReadAllBytes(void *buffer, size_t size) {
-  return ::ReadAllBytes(*m_conn, buffer, size);
+Status AdbSyncService::SetupSyncConnection() {
+  // Run the three setup calls in order; each one is skipped once an earlier
+  // call has failed, and the first failure is what gets returned.
+  Status error = ConnectToAdb(*m_conn);
+
+  if (error.Success())
+    error = SelectTargetDevice(*m_conn, m_device_id);
+
+  if (error.Success())
+    error = EnterSyncMode(*m_conn);
+  return error;
 }
diff --git a/lldb/source/Plugins/Platform/Android/AdbClient.h b/lldb/source/Plugins/Platform/Android/AdbClient.h
index 851c099..341a9fa 100644
--- a/lldb/source/Plugins/Platform/Android/AdbClient.h
+++ b/lldb/source/Plugins/Platform/Android/AdbClient.h
@@ -10,6 +10,7 @@
 #define LLDB_SOURCE_PLUGINS_PLATFORM_ANDROID_ADBCLIENT_H
 
 #include "lldb/Utility/Status.h"
+#include "llvm/Support/Error.h"
 #include <chrono>
 #include <functional>
 #include <list>
@@ -32,59 +33,21 @@ public:
 
   using DeviceIDList = std::list<std::string>;
 
-  class SyncService {
-    friend class AdbClient;
-
-  public:
-    virtual ~SyncService();
-
-    virtual Status PullFile(const FileSpec &remote_file,
-                            const FileSpec &local_file);
-
-    Status PushFile(const FileSpec &local_file, const FileSpec &remote_file);
-
-    virtual Status Stat(const FileSpec &remote_file, uint32_t &mode,
-                        uint32_t &size, uint32_t &mtime);
-
-    bool IsConnected() const;
-
-  protected:
-    explicit SyncService(std::unique_ptr<Connection> &&conn);
-
-  private:
-    Status SendSyncRequest(const char *request_id, const uint32_t data_len,
-                           const void *data);
-
-    Status ReadSyncHeader(std::string &response_id, uint32_t &data_len);
-
-    Status PullFileChunk(std::vector<char> &buffer, bool &eof);
-
-    Status ReadAllBytes(void *buffer, size_t size);
-
-    Status internalPullFile(const FileSpec &remote_file,
-                            const FileSpec &local_file);
-
-    Status internalPushFile(const FileSpec &local_file,
-                            const FileSpec &remote_file);
-
-    Status internalStat(const FileSpec &remote_file, uint32_t &mode,
-                        uint32_t &size, uint32_t &mtime);
-
-    Status executeCommand(const std::function<Status()> &cmd);
-
-    std::unique_ptr<Connection> m_conn;
-  };
-
-  static Status CreateByDeviceID(const std::string &device_id, AdbClient &adb);
+  /// Resolves a device identifier to its canonical form.
+  ///
+  /// \param device_id the device identifier to resolve (may be empty).
+  ///
+  /// \returns Expected<std::string> containing the resolved device ID on
+  ///          success, or an Error if the device ID cannot be resolved or
+  ///          is ambiguous.
+  static llvm::Expected<std::string> ResolveDeviceID(llvm::StringRef device_id);
 
   AdbClient();
-  explicit AdbClient(const std::string &device_id);
+  explicit AdbClient(llvm::StringRef device_id);
 
   virtual ~AdbClient();
 
-  const std::string &GetDeviceID() const;
-
-  Status GetDevices(DeviceIDList &device_list);
+  llvm::StringRef GetDeviceID() const;
 
   Status SetPortForwarding(const uint16_t local_port,
                            const uint16_t remote_port);
@@ -102,39 +65,50 @@ public:
                              std::chrono::milliseconds timeout,
                              const FileSpec &output_file_spec);
 
-  virtual std::unique_ptr<SyncService> GetSyncService(Status &error);
-
-  Status SwitchDeviceTransport();
-
-private:
   Status Connect();
 
-  void SetDeviceID(const std::string &device_id);
-
-  Status SendMessage(const std::string &packet, const bool reconnect = true);
-
-  Status SendDeviceMessage(const std::string &packet);
-
-  Status ReadMessage(std::vector<char> &message);
+private:
+  Status SendDeviceMessage(llvm::StringRef packet);
 
   Status ReadMessageStream(std::vector<char> &message,
                            std::chrono::milliseconds timeout);
 
-  Status GetResponseError(const char *response_id);
+  Status internalShell(const char *command, std::chrono::milliseconds timeout,
+                       std::vector<char> &output_buf);
 
-  Status ReadResponseStatus();
+  std::string m_device_id;
+  std::unique_ptr<Connection> m_conn;
+};
 
-  Status Sync();
+class AdbSyncService {
+public:
+  explicit AdbSyncService(const std::string device_id);
+  virtual ~AdbSyncService();
+  Status SetupSyncConnection();
 
-  Status StartSync();
+  virtual Status PullFile(const FileSpec &remote_file,
+                          const FileSpec &local_file);
+  virtual Status PushFile(const FileSpec &local_file,
+                          const FileSpec &remote_file);
+  virtual Status Stat(const FileSpec &remote_file, uint32_t &mode,
+                      uint32_t &size, uint32_t &mtime);
+  virtual bool IsConnected() const;
 
-  Status internalShell(const char *command, std::chrono::milliseconds timeout,
-                       std::vector<char> &output_buf);
+  llvm::StringRef GetDeviceId() const { return m_device_id; }
 
-  Status ReadAllBytes(void *buffer, size_t size);
+private:
+  Status SendSyncRequest(const char *request_id, const uint32_t data_len,
+                         const void *data);
+  Status ReadSyncHeader(std::string &response_id, uint32_t &data_len);
+  Status PullFileChunk(std::vector<char> &buffer, bool &eof);
+  Status PullFileImpl(const FileSpec &remote_file, const FileSpec &local_file);
+  Status PushFileImpl(const FileSpec &local_file, const FileSpec &remote_file);
+  Status StatImpl(const FileSpec &remote_file, uint32_t &mode, uint32_t &size,
+                  uint32_t &mtime);
+  Status ExecuteCommand(const std::function<Status()> &cmd);
 
-  std::string m_device_id;
   std::unique_ptr<Connection> m_conn;
+  std::string m_device_id;
 };
 
 } // namespace platform_android
diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp
index 5bc9cc133f..600cc0a 100644
--- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp
+++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp
@@ -9,10 +9,8 @@
 #include "lldb/Core/Module.h"
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Core/Section.h"
-#include "lldb/Host/HostInfo.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
-#include "lldb/Utility/Scalar.h"
 #include "lldb/Utility/UriParser.h"
 #include "lldb/ValueObject/ValueObject.h"
 
@@ -194,12 +192,10 @@ Status PlatformAndroid::ConnectRemote(Args &args) {
 
   auto error = PlatformLinux::ConnectRemote(args);
   if (error.Success()) {
-    AdbClient adb;
-    error = AdbClient::CreateByDeviceID(m_device_id, adb);
-    if (error.Fail())
-      return error;
-
-    m_device_id = adb.GetDeviceID();
+    auto resolved_device_id_or_error = AdbClient::ResolveDeviceID(m_device_id);
+    if (!resolved_device_id_or_error)
+      return Status::FromError(resolved_device_id_or_error.takeError());
+    m_device_id = *resolved_device_id_or_error;
   }
   return error;
 }
@@ -216,29 +212,33 @@ Status PlatformAndroid::GetFile(const FileSpec &source,
 
   Status error;
   auto sync_service = GetSyncService(error);
-  if (error.Fail())
-    return error;
-
-  uint32_t mode = 0, size = 0, mtime = 0;
-  error = sync_service->Stat(source_spec, mode, size, mtime);
-  if (error.Fail())
-    return error;
 
-  if (mode != 0)
-    return sync_service->PullFile(source_spec, destination);
+  // If sync service is available, try to use it
+  if (error.Success() && sync_service) {
+    uint32_t mode = 0, size = 0, mtime = 0;
+    error = sync_service->Stat(source_spec, mode, size, mtime);
+    if (error.Success()) {
+      if (mode != 0)
+        return sync_service->PullFile(source_spec, destination);
+
+      // mode == 0 can signify that adbd cannot access the file due to security
+      // constraints - fall through to try "cat ..." as a fallback.
+      Log *log = GetLog(LLDBLog::Platform);
+      LLDB_LOGF(log, "Got mode == 0 on '%s': try to get file via 'shell cat'",
+                source_spec.GetPath(false).c_str());
+    }
+  }
 
+  // Fallback to shell cat command if sync service failed or returned mode == 0
   std::string source_file = source_spec.GetPath(false);
 
   Log *log = GetLog(LLDBLog::Platform);
-  LLDB_LOGF(log, "Got mode == 0 on '%s': try to get file via 'shell cat'",
-            source_file.c_str());
+  LLDB_LOGF(log, "Using shell cat fallback for '%s'", source_file.c_str());
 
   if (strchr(source_file.c_str(), '\'') != nullptr)
     return Status::FromErrorString(
         "Doesn't support single-quotes in filenames");
 
-  // mode == 0 can signify that adbd cannot access the file due security
-  // constraints - try "cat ..." as a fallback.
   AdbClientUP adb(GetAdbClient(error));
   if (error.Fail())
     return error;
@@ -275,12 +275,19 @@ Status PlatformAndroid::DownloadModuleSlice(const FileSpec &src_file_spec,
                                             const uint64_t src_offset,
                                             const uint64_t src_size,
                                             const FileSpec &dst_file_spec) {
+  std::string source_file = src_file_spec.GetPath(false);
+  if (source_file.empty())
+    return Status::FromErrorString("Source file path cannot be empty");
+
+  std::string destination_file = dst_file_spec.GetPath(false);
+  if (destination_file.empty())
+    return Status::FromErrorString("Destination file path cannot be empty");
+
   // In Android API level 23 and above, dynamic loader is able to load .so
   // file directly from APK. In that case, src_offset will be an non-zero.
   if (src_offset == 0) // Use GetFile for a normal file.
     return GetFile(src_file_spec, dst_file_spec);
 
-  std::string source_file = src_file_spec.GetPath(false);
   if (source_file.find('\'') != std::string::npos)
     return Status::FromErrorString(
         "Doesn't support single-quotes in filenames");
@@ -424,7 +431,7 @@ PlatformAndroid::GetLibdlFunctionDeclarations(lldb_private::Process *process) {
   std::vector<const char *> dl_open_names = {"__dl_dlopen", "dlopen"};
   const char *dl_open_name = nullptr;
   Target &target = process->GetTarget();
-  for (auto name : dl_open_names) {
+  for (auto *name : dl_open_names) {
     target.GetImages().FindFunctionSymbols(
         ConstString(name), eFunctionNameTypeFull, matching_symbols);
     if (matching_symbols.GetSize()) {
@@ -445,11 +452,8 @@ PlatformAndroid::GetLibdlFunctionDeclarations(lldb_private::Process *process) {
 }
 
 PlatformAndroid::AdbClientUP PlatformAndroid::GetAdbClient(Status &error) {
-  AdbClientUP adb(std::make_unique<AdbClient>(m_device_id));
-  if (adb)
-    error.Clear();
-  else
-    error = Status::FromErrorString("Failed to create AdbClient");
+  AdbClientUP adb = std::make_unique<AdbClient>(m_device_id);
+  error = adb->Connect();
   return adb;
 }
 
@@ -473,14 +477,14 @@ std::string PlatformAndroid::GetRunAs() {
   }
   return run_as.str();
 }
 
-AdbClient::SyncService *PlatformAndroid::GetSyncService(Status &error) {
-  if (m_adb_sync_svc && m_adb_sync_svc->IsConnected())
-    return m_adb_sync_svc.get();
-
-  AdbClientUP adb(GetAdbClient(error));
+// Builds a new AdbSyncService for m_device_id on every call. The service is
+// returned only when SetupSyncConnection() succeeds; otherwise nullptr is
+// returned and \p error holds the failure.
+std::unique_ptr<AdbSyncService> PlatformAndroid::GetSyncService(Status &error) {
+  auto sync_service = std::make_unique<AdbSyncService>(m_device_id);
+  error = sync_service->SetupSyncConnection();
   if (error.Fail())
     return nullptr;
-  m_adb_sync_svc = adb->GetSyncService(error);
-  return (error.Success()) ? m_adb_sync_svc.get() : nullptr;
+  return sync_service;
 }
diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h
index 5602edf..3384525 100644
--- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h
+++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h
@@ -75,14 +75,15 @@ protected:
   typedef std::unique_ptr<AdbClient> AdbClientUP;
   virtual AdbClientUP GetAdbClient(Status &error);
 
+  std::string GetRunAs();
+
+public:
   virtual llvm::StringRef GetPropertyPackageName();
 
-  std::string GetRunAs();
+protected:
+  virtual std::unique_ptr<AdbSyncService> GetSyncService(Status &error);
 
 private:
-  AdbClient::SyncService *GetSyncService(Status &error);
-
-  std::unique_ptr<AdbClient::SyncService> m_adb_sync_svc;
   std::string m_device_id;
   uint32_t m_sdk_version;
 };
diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
index 0cf6480..461ee8e3 100644
--- a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
+++ b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
@@ -21,6 +21,7 @@
 using namespace lldb;
 using namespace lldb_private;
 using namespace platform_android;
+using namespace llvm;
 
 static const lldb::pid_t g_remote_platform_pid =
     0; // Alias for the process id of lldb-platform
@@ -32,12 +33,12 @@ static Status ForwardPortWithAdb(
     std::string &device_id) {
   Log *log = GetLog(LLDBLog::Platform);
 
-  AdbClient adb;
-  auto error = AdbClient::CreateByDeviceID(device_id, adb);
-  if (error.Fail())
-    return error;
+  auto resolved_device_id_or_error = AdbClient::ResolveDeviceID(device_id);
+  if (!resolved_device_id_or_error)
+    return Status::FromError(resolved_device_id_or_error.takeError());
+  device_id = *resolved_device_id_or_error;
 
-  device_id = adb.GetDeviceID();
+  AdbClient adb(device_id);
   LLDB_LOGF(log, "Connected to Android device \"%s\"", device_id.c_str());
 
   if (remote_port != 0) {
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
index 986d647..112eb06 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
@@ -644,8 +644,14 @@ SymbolFileNativePDB::CreateClassStructUnion(PdbTypeSymId type_id,
 
   std::string uname = GetUnqualifiedTypeName(record);
 
-  // FIXME: Search IPI stream for LF_UDT_MOD_SRC_LINE.
+  llvm::Expected<Declaration> maybeDecl = ResolveUdtDeclaration(type_id);
   Declaration decl;
+  if (maybeDecl)
+    decl = std::move(*maybeDecl);
+  else
+    LLDB_LOG_ERROR(GetLog(LLDBLog::Symbols), maybeDecl.takeError(),
+                   "Failed to resolve declaration for '{1}': {0}", uname);
+
   return MakeType(toOpaqueUid(type_id), ConstString(uname), size, nullptr,
                   LLDB_INVALID_UID, Type::eEncodingIsUID, decl, ct,
                   Type::ResolveState::Forward);
@@ -668,7 +674,14 @@ lldb::TypeSP SymbolFileNativePDB::CreateTagType(PdbTypeSymId type_id,
                                                 CompilerType ct) {
   std::string uname = GetUnqualifiedTypeName(er);
 
+  llvm::Expected<Declaration> maybeDecl = ResolveUdtDeclaration(type_id);
   Declaration decl;
+  if (maybeDecl)
+    decl = std::move(*maybeDecl);
+  else
+    LLDB_LOG_ERROR(GetLog(LLDBLog::Symbols), maybeDecl.takeError(),
+                   "Failed to resolve declaration for '{1}': {0}", uname);
+
   TypeSP underlying_type = GetOrCreateType(er.UnderlyingType);
 
   return MakeType(
@@ -1675,7 +1688,7 @@ void SymbolFileNativePDB::CacheFunctionNames() {
         llvm::cantFail(SymbolDeserializer::deserializeAs<ProcSym>(*iter));
     if ((proc.Flags & ProcSymFlags::IsUnreachable) != ProcSymFlags::None)
       continue;
-    if (proc.Name.empty())
+    if (proc.Name.empty() || proc.FunctionType.isSimple())
       continue;
 
     // The function/procedure symbol only contains the demangled name.
@@ -2556,3 +2569,94 @@ SymbolFileNativePDB::GetContextForType(TypeIndex ti) {
   }
   return ctx;
 }
+
+/// Populates m_udt_declarations from the LF_UDT_SRC_LINE and
+/// LF_UDT_MOD_SRC_LINE records found in the IPI stream.
+///
+/// Only the first record seen for a given UDT is kept. A record that fails
+/// to deserialize is logged and skipped instead of aborting the process.
+void SymbolFileNativePDB::CacheUdtDeclarations() {
+  for (CVType cvt : m_index->ipi().typeArray()) {
+    switch (cvt.kind()) {
+    case LF_UDT_SRC_LINE: {
+      UdtSourceLineRecord udt_src;
+      if (llvm::Error err = TypeDeserializer::deserializeAs(cvt, udt_src)) {
+        LLDB_LOG_ERROR(GetLog(LLDBLog::Symbols), std::move(err),
+                       "Failed to deserialize LF_UDT_SRC_LINE: {0}");
+        break;
+      }
+      m_udt_declarations.try_emplace(
+          udt_src.UDT, UdtDeclaration{/*FileNameIndex=*/udt_src.SourceFile,
+                                      /*IsIpiIndex=*/true,
+                                      /*Line=*/udt_src.LineNumber});
+    } break;
+    case LF_UDT_MOD_SRC_LINE: {
+      UdtModSourceLineRecord udt_mod_src;
+      if (llvm::Error err =
+              TypeDeserializer::deserializeAs(cvt, udt_mod_src)) {
+        LLDB_LOG_ERROR(GetLog(LLDBLog::Symbols), std::move(err),
+                       "Failed to deserialize LF_UDT_MOD_SRC_LINE: {0}");
+        break;
+      }
+      // Some types might be contributed by multiple modules. We assume that
+      // they all point to the same file and line because we can only provide
+      // one location.
+      m_udt_declarations.try_emplace(
+          udt_mod_src.UDT,
+          UdtDeclaration{/*FileNameIndex=*/udt_mod_src.SourceFile,
+                         /*IsIpiIndex=*/false,
+                         /*Line=*/udt_mod_src.LineNumber});
+    } break;
+    default:
+      break;
+    }
+  }
+}
+
+/// Looks up the source location recorded for the UDT \p type_id.
+///
+/// The lookup table is filled by CacheUdtDeclarations() on the first call.
+///
+/// \returns the declaration (default-constructed if the recorded file name is
+///          rustc's placeholder), or an Error when no record exists for the
+///          type or its file name cannot be read.
+llvm::Expected<Declaration>
+SymbolFileNativePDB::ResolveUdtDeclaration(PdbTypeSymId type_id) {
+  std::call_once(m_cached_udt_declarations, [this] { CacheUdtDeclarations(); });
+
+  auto it = m_udt_declarations.find(type_id.index);
+  if (it == m_udt_declarations.end())
+    return llvm::createStringError("No UDT declaration found");
+
+  llvm::StringRef file_name;
+  if (it->second.IsIpiIndex) {
+    CVType cvt = m_index->ipi().getType(it->second.FileNameIndex);
+    if (cvt.kind() != LF_STRING_ID)
+      return llvm::createStringError("File name was not a LF_STRING_ID");
+
+    // Report a malformed record to the caller rather than aborting.
+    StringIdRecord sid;
+    if (llvm::Error err = TypeDeserializer::deserializeAs(cvt, sid))
+      return std::move(err);
+    file_name = sid.String;
+  } else {
+    // The file name index is an index into the string table
+    auto string_table = m_index->pdb().getStringTable();
+    if (!string_table)
+      return string_table.takeError();
+
+    llvm::Expected<llvm::StringRef> string =
+        string_table->getStringTable().getString(
+            it->second.FileNameIndex.getIndex());
+    if (!string)
+      return string.takeError();
+    file_name = *string;
+  }
+
+  // rustc emits the placeholder "\<unknown>" (note the leading backslash) for
+  // some files; return an empty Declaration instead of a bogus path.
+  if (file_name == "\\<unknown>")
+    return Declaration();
+
+  return Declaration(FileSpec(file_name), it->second.Line);
+}
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h
index 6bbeb8b..cfa0041 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.h
@@ -262,6 +262,9 @@ private:
 
   void CacheFunctionNames();
 
+  void CacheUdtDeclarations();
+  llvm::Expected<Declaration> ResolveUdtDeclaration(PdbTypeSymId type_id);
+
   llvm::BumpPtrAllocator m_allocator;
 
   lldb::addr_t m_obj_load_address = 0;
@@ -283,6 +286,18 @@ private:
   llvm::DenseMap<llvm::codeview::TypeIndex, llvm::codeview::TypeIndex>
       m_parent_types;
 
+  struct UdtDeclaration {
+    /// If `IsIpiIndex` is false, an index into the `/names` string table
+    /// (LF_UDT_MOD_SRC_LINE); otherwise an index into the IPI stream that
+    /// refers to an LF_STRING_ID record (LF_UDT_SRC_LINE).
+    llvm::codeview::TypeIndex FileNameIndex;
+    bool IsIpiIndex;
+
+    uint32_t Line;
+  };
+  llvm::DenseMap<llvm::codeview::TypeIndex, UdtDeclaration> m_udt_declarations;
+  std::once_flag m_cached_udt_declarations;
+
   lldb_private::UniqueCStringMap<uint32_t> m_type_base_names;
 
   /// mangled name/full function name -> Global ID(s)
diff --git a/lldb/test/API/commands/expression/TestRegisterExpressionEndian.py b/lldb/test/API/commands/expression/TestRegisterExpressionEndian.py
index 66e38df..d6de873 100644
--- a/lldb/test/API/commands/expression/TestRegisterExpressionEndian.py
+++ b/lldb/test/API/commands/expression/TestRegisterExpressionEndian.py
@@ -40,9 +40,15 @@ class Responder(MockGDBServerResponder):
 
 class TestXMLRegisterFlags(GDBRemoteTestBase):
     def do_endian_test(self, endian):
-        architecture, pc_reg_name = {
-            Endian.BIG: ("s390x", "pswa"),
-            Endian.LITTLE: ("aarch64", "pc"),
+        architecture, pc_reg_name, yaml_file, data, machine = {
+            Endian.BIG: ("s390x", "pswa", "s390x.yaml", "ELFDATA2MSB", "EM_S390"),
+            Endian.LITTLE: (
+                "aarch64",
+                "pc",
+                "aarch64.yaml",
+                "ELFDATA2LSB",
+                "EM_AARCH64",
+            ),
         }[endian]
 
         self.server.responder = Responder(
@@ -58,14 +64,35 @@ class TestXMLRegisterFlags(GDBRemoteTestBase):
             ),
             endian,
         )
-        target = self.dbg.CreateTarget("")
+
+        # We need to have a program file, so that we have a full type system,
+        # so that we can do the casts later.
+        obj_path = self.getBuildArtifact("main.o")
+        yaml_path = self.getBuildArtifact(yaml_file)
+        with open(yaml_path, "w") as f:
+            f.write(
+                dedent(
+                    f"""\
+                --- !ELF
+                FileHeader:
+                  Class:    ELFCLASS64
+                  Data:     {data}
+                  Type:     ET_REL
+                  Machine:  {machine}
+                ...
+                """
+                )
+            )
+        self.yaml2obj(yaml_path, obj_path)
+        target = self.dbg.CreateTarget(obj_path)
+
         process = self.connect(target)
         lldbutil.expect_state_changes(
             self, self.dbg.GetListener(), process, [lldb.eStateStopped]
         )
 
         # If expressions convert register values into target endian, the
-        # result of register read and expr should be the same.
+        # result of register read, expr and casts should be the same.
         pc_value = "0x0000000000001234"
         self.expect(
             "register read pc",
@@ -73,14 +100,29 @@ class TestXMLRegisterFlags(GDBRemoteTestBase):
         )
         self.expect("expr --format hex -- $pc", substrs=[pc_value])
 
+        pc = (
+            process.thread[0]
+            .frame[0]
+            .GetRegisters()
+            .GetValueAtIndex(0)
+            .GetChildMemberWithName("pc")
+        )
+        ull = target.FindTypes("unsigned long long").GetTypeAtIndex(0)
+        pc_ull = pc.Cast(ull)
+
+        self.assertEqual(pc.GetValue(), pc_ull.GetValue())
+        self.assertEqual(pc.GetValueAsAddress(), pc_ull.GetValueAsAddress())
+        self.assertEqual(pc.GetValueAsSigned(), pc_ull.GetValueAsSigned())
+        self.assertEqual(pc.GetValueAsUnsigned(), pc_ull.GetValueAsUnsigned())
+
     @skipIfXmlSupportMissing
     @skipIfRemote
+    @skipIfLLVMTargetMissing("AArch64")
     def test_little_endian_target(self):
         self.do_endian_test(Endian.LITTLE)
 
     @skipIfXmlSupportMissing
     @skipIfRemote
-    # Unlike AArch64, we do need the backend present for this test to work.
     @skipIfLLVMTargetMissing("SystemZ")
     def test_big_endian_target(self):
         self.do_endian_test(Endian.BIG)
diff --git a/lldb/test/API/python_api/find_in_memory/address_ranges_helper.py b/lldb/test/API/python_api/find_in_memory/address_ranges_helper.py
index dcceca6..102f2b0 100644
--- a/lldb/test/API/python_api/find_in_memory/address_ranges_helper.py
+++ b/lldb/test/API/python_api/find_in_memory/address_ranges_helper.py
@@ -55,27 +55,34 @@ def GetRangeFromAddrValue(test_base, addr, shrink=False):
     return lldb.SBAddressRange(start, size)
 
 
-def IsWithinRange(addr, size, range, target):
-    start_addr = range.GetBaseAddress().GetLoadAddress(target)
-    end_addr = start_addr + range.GetByteSize()
-    addr = addr.GetValueAsUnsigned()
-    return addr >= start_addr and addr + size <= end_addr
-
-
 def GetHeapRanges(test_base, shrink=False):
     frame = test_base.thread.GetSelectedFrame()
 
     ex = frame.EvaluateExpression("heap_pointer1")
     test_base.assertTrue(ex.IsValid())
-    range = GetRangeFromAddrValue(test_base, ex, shrink)
-    addr_ranges = lldb.SBAddressRangeList()
-    addr_ranges.Append(range)
+    range1 = GetRangeFromAddrValue(test_base, ex, shrink)
+    range1_start = range1.GetBaseAddress().GetLoadAddress(test_base.target)
+    range1_end = range1_start + range1.GetByteSize()
 
     ex = frame.EvaluateExpression("heap_pointer2")
     test_base.assertTrue(ex.IsValid())
-    size = len(DOUBLE_INSTANCE_PATTERN_HEAP)
-    if not IsWithinRange(ex, size, addr_ranges[0], test_base.target):
-        addr_ranges.Append(GetRangeFromAddrValue(test_base, ex, shrink))
+    range2 = GetRangeFromAddrValue(test_base, ex, shrink)
+    range2_start = range2.GetBaseAddress().GetLoadAddress(test_base.target)
+    range2_end = range2_start + range2.GetByteSize()
+
+    addr_ranges = lldb.SBAddressRangeList()
+
+    if range1_end < range2_start or range2_end < range1_start:
+        # The ranges do not overlap; add them both.
+        addr_ranges.Append(range1)
+        addr_ranges.Append(range2)
+    else:
+        # Merge overlapping ranges.
+        base = min(range1_start, range2_start)
+        end = max(range1_end, range2_end)
+        start = lldb.SBAddress(base, test_base.target)
+        size = end - base
+        addr_ranges.Append(lldb.SBAddressRange(start, size))
 
     return addr_ranges
 
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/unknown-udt-decl.ll b/lldb/test/Shell/SymbolFile/NativePDB/unknown-udt-decl.ll
new file mode 100644
index 0000000..af78789
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/NativePDB/unknown-udt-decl.ll
@@ -0,0 +1,56 @@
+; Test that the declaration for UDTs won't be "<unknown>" or "\<unknown>".
+; Rustc sets the location of some builtin types to this string.
+
+; REQUIRES: system-windows
+; RUN: %build --compiler=clang-cl --nodefaultlib -o %t.exe -- %s
+; RUN: lldb-test symbols %t.exe | FileCheck %s
+
+; there shouldn't be a declaration (would be between size and compiler_type)
+; CHECK: Type{{.*}} , name = "Foo", size = 1, compiler_type = {{.*}} struct Foo {
+
+; This is edited output from clang that simulates rustc behavior (see !17)
+; Source:
+; struct Foo {};
+;
+; int main() { Foo f; }
+
+
+; ModuleID = 'main.cpp'
+source_filename = "main.cpp"
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.44.35207"
+
+%struct.Foo = type { i8 }
+
+; Function Attrs: mustprogress noinline norecurse nounwind optnone uwtable
+define dso_local noundef i32 @main() #0 !dbg !9 {
+  %1 = alloca %struct.Foo, align 1
+    #dbg_declare(ptr %1, !14, !DIExpression(), !16)
+  ret i32 0, !dbg !16
+}
+
+attributes #0 = { mustprogress noinline norecurse nounwind optnone uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.1.6", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "main.cpp", directory: "F:\\Dev\\rust-dbg-test", checksumkind: CSK_MD5, checksum: "b8942260dadf9ec35328889f05afb954")
+!2 = !{i32 2, !"CodeView", i32 1}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 2}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"uwtable", i32 2}
+!7 = !{i32 1, !"MaxTLSAlign", i32 65536}
+!8 = !{!"clang version 20.1.6"}
+!9 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !10, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !13)
+!10 = !DISubroutineType(types: !11)
+!11 = !{!12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{}
+!14 = !DILocalVariable(name: "f", scope: !9, file: !1, line: 3, type: !15)
+!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo", file: !17, line: 1, size: 8, flags: DIFlagTypePassByValue, elements: !13, identifier: ".?AUFoo@@")
+!16 = !DILocation(line: 3, scope: !9)
+; This is how rustc emits some types
+!17 = !DIFile(filename: "<unknown>", directory: "")
diff --git a/lldb/test/Shell/SymbolFile/PDB/class-layout.test b/lldb/test/Shell/SymbolFile/PDB/class-layout.test
index e9a7d1c..eca910e 100644
--- a/lldb/test/Shell/SymbolFile/PDB/class-layout.test
+++ b/lldb/test/Shell/SymbolFile/PDB/class-layout.test
@@ -12,9 +12,19 @@ RUN: lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix
 RUN: lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=BASE %s
 RUN: lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=FRIEND %s
 RUN: lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=CLASS %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=ENUM %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=UNION %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=STRUCT %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=COMPLEX %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=LIST %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=UNNAMED-STRUCT %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=BASE %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=FRIEND %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/ClassLayoutTest.cpp.exe | FileCheck --check-prefix=CLASS %s
 
 CHECK: Module [[MOD:.*]]
-CHECK: SymbolFile pdb ([[MOD]])
+CHECK: SymbolFile {{(native-)?}}pdb ([[MOD]])
 CHECK: {{^[0-9A-F]+}}:   CompileUnit{{[{]0x[0-9a-f]+[}]}}, language = "c++", file = '{{.*}}\ClassLayoutTest.cpp'
 
 ENUM:  name = "Enum", size = 4,  decl = ClassLayoutTest.cpp:5
diff --git a/lldb/test/Shell/SymbolFile/PDB/enums-layout.test b/lldb/test/Shell/SymbolFile/PDB/enums-layout.test
index 6f861c6d..9766d6f 100644
--- a/lldb/test/Shell/SymbolFile/PDB/enums-layout.test
+++ b/lldb/test/Shell/SymbolFile/PDB/enums-layout.test
@@ -7,6 +7,12 @@ RUN: lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-
 RUN: lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=UCHAR-ENUM %s
 RUN: lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=CLASS-ENUM %s
 RUN: lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=STRUCT-ENUM %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=ENUM %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=CONST-ENUM %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=EMPTY-ENUM %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=UCHAR-ENUM %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=CLASS-ENUM %s
+RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %t.dir/SimpleTypesTest.cpp.enums.exe | FileCheck --check-prefix=STRUCT-ENUM %s
 
 ; FIXME: PDB does not have information about scoped enumeration (Enum class) so the  
 ; compiler type used is the same as the one for unscoped enumeration.
diff --git a/lldb/unittests/Host/JSONTransportTest.cpp b/lldb/unittests/Host/JSONTransportTest.cpp
index cb6da6b..4e94582 100644
--- a/lldb/unittests/Host/JSONTransportTest.cpp
+++ b/lldb/unittests/Host/JSONTransportTest.cpp
@@ -132,6 +132,9 @@ public:
 
 } // namespace
 
+// Failing on Windows, see https://github.com/llvm/llvm-project/issues/153446.
+#ifndef _WIN32
+
 TEST_F(HTTPDelimitedJSONTransportTest, MalformedRequests) {
   std::string malformed_header =
       "COnTent-LenGth: -1\r\nContent-Type: text/json\r\n\r\nnotjosn";
@@ -336,3 +339,5 @@ TEST_F(JSONRPCTransportTest, NoDataTimeout) {
       RunOnce<JSONTestType>(/*timeout=*/std::chrono::milliseconds(10)),
       FailedWithMessage("timeout"));
 }
+
+#endif
diff --git a/lldb/unittests/Platform/Android/AdbClientTest.cpp b/lldb/unittests/Platform/Android/AdbClientTest.cpp
index 0808b96..719b7ca 100644
--- a/lldb/unittests/Platform/Android/AdbClientTest.cpp
+++ b/lldb/unittests/Platform/Android/AdbClientTest.cpp
@@ -6,8 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "gtest/gtest.h"
 #include "Plugins/Platform/Android/AdbClient.h"
+#include "lldb/Host/Socket.h"
+#include "lldb/Host/common/TCPSocket.h"
+#include "gtest/gtest.h"
+#include <chrono>
 #include <cstdlib>
 
 static void set_env(const char *var, const char *value) {
@@ -20,32 +23,117 @@ static void set_env(const char *var, const char *value) {
 
 using namespace lldb;
 using namespace lldb_private;
-
-namespace lldb_private {
-namespace platform_android {
+using namespace lldb_private::platform_android;
 
 class AdbClientTest : public ::testing::Test {
 public:
-  void SetUp() override { set_env("ANDROID_SERIAL", ""); }
+  void SetUp() override {
+    set_env("ANDROID_SERIAL", "");
+    set_env("ANDROID_ADB_SERVER_PORT", "");
+  }
 
-  void TearDown() override { set_env("ANDROID_SERIAL", ""); }
+  void TearDown() override {
+    set_env("ANDROID_SERIAL", "");
+    set_env("ANDROID_ADB_SERVER_PORT", "");
+  }
 };
 
-TEST(AdbClientTest, CreateByDeviceId) {
-  AdbClient adb;
-  Status error = AdbClient::CreateByDeviceID("device1", adb);
-  EXPECT_TRUE(error.Success());
-  EXPECT_EQ("device1", adb.GetDeviceID());
+TEST_F(AdbClientTest, ResolveDeviceId_ExplicitDeviceId) {
+  auto result = AdbClient::ResolveDeviceID("device1");
+  EXPECT_TRUE(static_cast<bool>(result));
+  EXPECT_EQ("device1", *result);
 }
 
-TEST(AdbClientTest, CreateByDeviceId_ByEnvVar) {
+TEST_F(AdbClientTest, ResolveDeviceId_ByEnvVar) {
   set_env("ANDROID_SERIAL", "device2");
 
-  AdbClient adb;
-  Status error = AdbClient::CreateByDeviceID("", adb);
-  EXPECT_TRUE(error.Success());
-  EXPECT_EQ("device2", adb.GetDeviceID());
+  auto result = AdbClient::ResolveDeviceID("");
+  EXPECT_TRUE(static_cast<bool>(result));
+  EXPECT_EQ("device2", *result);
+}
+
+TEST_F(AdbClientTest, ResolveDeviceId_PrefersExplicitOverEnvVar) {
+  set_env("ANDROID_SERIAL", "env_device");
+
+  // Explicit device ID should take precedence over environment variable
+  auto result = AdbClient::ResolveDeviceID("explicit_device");
+  EXPECT_TRUE(static_cast<bool>(result));
+  EXPECT_EQ("explicit_device", *result);
+}
+
+TEST_F(AdbClientTest, AdbClient_Constructor_StoresDeviceId) {
+  AdbClient client("test_device_123");
+  EXPECT_EQ(client.GetDeviceID(), "test_device_123");
+}
+
+TEST_F(AdbClientTest, AdbClient_DefaultConstructor) {
+  AdbClient client;
+  EXPECT_EQ(client.GetDeviceID(), "");
 }
 
-} // end namespace platform_android
-} // end namespace lldb_private
+TEST_F(AdbClientTest, AdbSyncService_Constructor_StoresDeviceId) {
+  AdbSyncService sync("device123");
+  EXPECT_EQ(sync.GetDeviceId(), "device123");
+}
+
+TEST_F(AdbClientTest, AdbSyncService_OperationsFailWhenNotConnected) {
+  AdbSyncService sync_service("test_device");
+
+  // Verify service is not connected initially
+  EXPECT_FALSE(sync_service.IsConnected());
+
+  // File operations should fail when not connected
+  FileSpec remote_file("/data/test.txt");
+  FileSpec local_file("/tmp/test.txt");
+  uint32_t mode, size, mtime;
+
+  Status stat_result = sync_service.Stat(remote_file, mode, size, mtime);
+  EXPECT_TRUE(stat_result.Fail());
+
+  Status pull_result = sync_service.PullFile(remote_file, local_file);
+  EXPECT_TRUE(pull_result.Fail());
+
+  Status push_result = sync_service.PushFile(local_file, remote_file);
+  EXPECT_TRUE(push_result.Fail());
+}
+
+static uint16_t FindUnusedPort() {
+  auto temp_socket = std::make_unique<TCPSocket>(true);
+  Status error = temp_socket->Listen("localhost:0", 1);
+  if (error.Fail()) {
+    return 0; // fallback
+  }
+  uint16_t port = temp_socket->GetLocalPortNumber();
+  temp_socket.reset(); // Close the socket to free the port
+  return port;
+}
+
+TEST_F(AdbClientTest, RealTcpConnection) {
+  uint16_t unused_port = FindUnusedPort();
+  ASSERT_NE(unused_port, 0) << "Failed to find an unused port";
+
+  std::string port_str = std::to_string(unused_port);
+  set_env("ANDROID_ADB_SERVER_PORT", port_str.c_str());
+
+  AdbClient client;
+  const auto status1 = client.Connect();
+  EXPECT_FALSE(status1.Success())
+      << "Connection should fail when no server is listening on port "
+      << unused_port;
+
+  // now start a server on the port and try again
+  auto listen_socket = std::make_unique<TCPSocket>(true);
+  std::string listen_address = "localhost:" + port_str;
+  Status error = listen_socket->Listen(listen_address.c_str(), 5);
+  ASSERT_TRUE(error.Success()) << "Failed to create listening socket on port "
+                               << unused_port << ": " << error.AsCString();
+
+  // Verify the socket is listening on the expected port
+  ASSERT_EQ(listen_socket->GetLocalPortNumber(), unused_port)
+      << "Socket is not listening on the expected port";
+
+  const auto status2 = client.Connect();
+  EXPECT_TRUE(status2.Success())
+      << "Connection should succeed when server is listening on port "
+      << unused_port;
+}
diff --git a/lldb/unittests/Platform/Android/PlatformAndroidTest.cpp b/lldb/unittests/Platform/Android/PlatformAndroidTest.cpp
index d021562..514bce1 100644
--- a/lldb/unittests/Platform/Android/PlatformAndroidTest.cpp
+++ b/lldb/unittests/Platform/Android/PlatformAndroidTest.cpp
@@ -8,8 +8,6 @@
 
 #include "Plugins/Platform/Android/PlatformAndroid.h"
 #include "Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.h"
-#include "TestingSupport/SubsystemRAII.h"
-#include "TestingSupport/TestUtilities.h"
 #include "lldb/Utility/Connection.h"
 #include "gmock/gmock.h"
 
@@ -20,212 +18,281 @@ using namespace testing;
 
 namespace {
 
-class MockSyncService : public AdbClient::SyncService {
-public:
-  MockSyncService() : SyncService(std::unique_ptr<Connection>()) {}
-
-  MOCK_METHOD2(PullFile,
-               Status(const FileSpec &remote_file, const FileSpec &local_file));
-  MOCK_METHOD4(Stat, Status(const FileSpec &remote_file, uint32_t &mode,
-                            uint32_t &size, uint32_t &mtime));
-};
-
-typedef std::unique_ptr<AdbClient::SyncService> SyncServiceUP;
-
 class MockAdbClient : public AdbClient {
 public:
-  explicit MockAdbClient() : AdbClient("mock") {}
+  explicit MockAdbClient() : AdbClient() {}
 
   MOCK_METHOD3(ShellToFile,
                Status(const char *command, std::chrono::milliseconds timeout,
                       const FileSpec &output_file_spec));
-  MOCK_METHOD1(GetSyncService, SyncServiceUP(Status &error));
 };
 
 class PlatformAndroidTest : public PlatformAndroid, public ::testing::Test {
 public:
   PlatformAndroidTest() : PlatformAndroid(false) {
     m_remote_platform_sp = PlatformSP(new PlatformAndroidRemoteGDBServer());
+
+    // Set up default mock behavior to avoid uninteresting call warnings
+    ON_CALL(*this, GetSyncService(_))
+        .WillByDefault([](Status &error) -> std::unique_ptr<AdbSyncService> {
+          error = Status::FromErrorString("Sync service unavailable");
+          return nullptr;
+        });
   }
 
   MOCK_METHOD1(GetAdbClient, AdbClientUP(Status &error));
   MOCK_METHOD0(GetPropertyPackageName, llvm::StringRef());
+  MOCK_METHOD1(GetSyncService, std::unique_ptr<AdbSyncService>(Status &error));
+
+  // Make GetSyncService public for testing
+  using PlatformAndroid::GetSyncService;
 };
 
 } // namespace
 
-TEST_F(PlatformAndroidTest, DownloadModuleSliceWithAdbClientError) {
+TEST_F(PlatformAndroidTest,
+       DownloadModuleSlice_AdbClientError_FailsGracefully) {
   EXPECT_CALL(*this, GetAdbClient(_))
-      .Times(1)
       .WillOnce(DoAll(WithArg<0>([](auto &arg) {
                         arg = Status::FromErrorString(
                             "Failed to create AdbClient");
                       }),
                       Return(ByMove(AdbClientUP()))));
 
-  EXPECT_TRUE(
-      DownloadModuleSlice(
-          FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096,
-          3600, FileSpec())
-          .Fail());
-}
-
-TEST_F(PlatformAndroidTest, DownloadModuleSliceWithNormalFile) {
-  auto sync_service = new MockSyncService();
-  EXPECT_CALL(*sync_service, Stat(FileSpec("/system/lib64/libc.so"), _, _, _))
-      .Times(1)
-      .WillOnce(DoAll(SetArgReferee<1>(1), Return(Status())));
-  EXPECT_CALL(*sync_service, PullFile(FileSpec("/system/lib64/libc.so"), _))
-      .Times(1)
-      .WillOnce(Return(Status()));
-
-  auto adb_client = new MockAdbClient();
-  EXPECT_CALL(*adb_client, GetSyncService(_))
-      .Times(1)
-      .WillOnce(Return(ByMove(SyncServiceUP(sync_service))));
-
-  EXPECT_CALL(*this, GetAdbClient(_))
-      .Times(1)
-      .WillOnce(Return(ByMove(AdbClientUP(adb_client))));
+  Status result = DownloadModuleSlice(
+      FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096,
+      3600, FileSpec("/tmp/libtest.so"));
 
-  EXPECT_TRUE(
-      DownloadModuleSlice(FileSpec("/system/lib64/libc.so"), 0, 0, FileSpec())
-          .Success());
+  EXPECT_TRUE(result.Fail());
+  EXPECT_THAT(result.AsCString(), HasSubstr("Failed to create AdbClient"));
 }
 
-TEST_F(PlatformAndroidTest, DownloadModuleSliceWithZipFile) {
-  auto adb_client = new MockAdbClient();
+TEST_F(PlatformAndroidTest, DownloadModuleSlice_ZipFile_UsesCorrectDdCommand) {
+  auto *adb_client = new MockAdbClient();
   EXPECT_CALL(*adb_client,
               ShellToFile(StrEq("dd if='/system/app/Test/Test.apk' "
                                 "iflag=skip_bytes,count_bytes "
                                 "skip=4096 count=3600 status=none"),
                           _, _))
-      .Times(1)
       .WillOnce(Return(Status()));
 
+  EXPECT_CALL(*this, GetPropertyPackageName())
+      .WillOnce(Return(llvm::StringRef("")));
+
   EXPECT_CALL(*this, GetAdbClient(_))
-      .Times(1)
       .WillOnce(Return(ByMove(AdbClientUP(adb_client))));
 
-  EXPECT_TRUE(
-      DownloadModuleSlice(
-          FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096,
-          3600, FileSpec())
-          .Success());
+  Status result = DownloadModuleSlice(
+      FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096,
+      3600, FileSpec("/tmp/libtest.so"));
+
+  EXPECT_TRUE(result.Success());
 }
 
-TEST_F(PlatformAndroidTest, DownloadModuleSliceWithZipFileAndRunAs) {
-  auto adb_client = new MockAdbClient();
+TEST_F(PlatformAndroidTest,
+       DownloadModuleSlice_ZipFileWithRunAs_UsesRunAsCommand) {
+  auto *adb_client = new MockAdbClient();
   EXPECT_CALL(*adb_client,
               ShellToFile(StrEq("run-as 'com.example.test' "
                                 "dd if='/system/app/Test/Test.apk' "
                                 "iflag=skip_bytes,count_bytes "
                                 "skip=4096 count=3600 status=none"),
                           _, _))
-      .Times(1)
       .WillOnce(Return(Status()));
 
   EXPECT_CALL(*this, GetPropertyPackageName())
-      .Times(1)
       .WillOnce(Return(llvm::StringRef("com.example.test")));
 
   EXPECT_CALL(*this, GetAdbClient(_))
-      .Times(1)
       .WillOnce(Return(ByMove(AdbClientUP(adb_client))));
 
-  EXPECT_TRUE(
-      DownloadModuleSlice(
-          FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096,
-          3600, FileSpec())
-          .Success());
+  Status result = DownloadModuleSlice(
+      FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096,
+      3600, FileSpec("/tmp/libtest.so"));
+
+  EXPECT_TRUE(result.Success());
 }
 
-TEST_F(PlatformAndroidTest, GetFileWithNormalFile) {
-  auto sync_service = new MockSyncService();
-  EXPECT_CALL(*sync_service, Stat(FileSpec("/data/local/tmp/test"), _, _, _))
-      .Times(1)
-      .WillOnce(DoAll(SetArgReferee<1>(1), Return(Status())));
-  EXPECT_CALL(*sync_service, PullFile(FileSpec("/data/local/tmp/test"), _))
-      .Times(1)
+TEST_F(PlatformAndroidTest,
+       DownloadModuleSlice_LargeFile_CalculatesParametersCorrectly) {
+  const uint64_t large_offset = 100 * 1024 * 1024; // 100MB offset
+  const uint64_t large_size = 50 * 1024 * 1024;    // 50MB size
+
+  auto *adb_client = new MockAdbClient();
+  EXPECT_CALL(*adb_client,
+              ShellToFile(StrEq("dd if='/system/app/Large.apk' "
+                                "iflag=skip_bytes,count_bytes "
+                                "skip=104857600 count=52428800 status=none"),
+                          _, _))
       .WillOnce(Return(Status()));
 
-  auto adb_client = new MockAdbClient();
-  EXPECT_CALL(*adb_client, GetSyncService(_))
-      .Times(1)
-      .WillOnce(Return(ByMove(SyncServiceUP(sync_service))));
+  EXPECT_CALL(*this, GetPropertyPackageName())
+      .WillOnce(Return(llvm::StringRef("")));
 
   EXPECT_CALL(*this, GetAdbClient(_))
-      .Times(1)
       .WillOnce(Return(ByMove(AdbClientUP(adb_client))));
 
-  EXPECT_TRUE(GetFile(FileSpec("/data/local/tmp/test"), FileSpec()).Success());
+  Status result = DownloadModuleSlice(
+      FileSpec("/system/app/Large.apk!/lib/arm64-v8a/large.so"), large_offset,
+      large_size, FileSpec("/tmp/large.so"));
+
+  EXPECT_TRUE(result.Success());
 }
 
-TEST_F(PlatformAndroidTest, GetFileWithCatFallback) {
-  auto sync_service = new MockSyncService();
-  EXPECT_CALL(
-      *sync_service,
-      Stat(FileSpec("/data/data/com.example.app/lib-main/libtest.so"), _, _, _))
-      .Times(1)
-      .WillOnce(DoAll(SetArgReferee<1>(0), Return(Status())));
+TEST_F(PlatformAndroidTest,
+       GetFile_SyncServiceUnavailable_FallsBackToShellCat) {
+  auto *adb_client = new MockAdbClient();
+  EXPECT_CALL(*adb_client,
+              ShellToFile(StrEq("cat '/data/local/tmp/test'"), _, _))
+      .WillOnce(Return(Status()));
 
-  auto adb_client0 = new MockAdbClient();
-  EXPECT_CALL(*adb_client0, GetSyncService(_))
-      .Times(1)
-      .WillOnce(Return(ByMove(SyncServiceUP(sync_service))));
+  EXPECT_CALL(*this, GetPropertyPackageName())
+      .WillOnce(Return(llvm::StringRef("")));
+
+  EXPECT_CALL(*this, GetAdbClient(_))
+      .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg.Clear(); }),
+                      Return(ByMove(AdbClientUP(adb_client)))));
+
+  EXPECT_CALL(*this, GetSyncService(_))
+      .WillOnce([](Status &error) -> std::unique_ptr<AdbSyncService> {
+        error = Status::FromErrorString("Sync service unavailable");
+        return nullptr;
+      });
+
+  Status result =
+      GetFile(FileSpec("/data/local/tmp/test"), FileSpec("/tmp/test"));
+  EXPECT_TRUE(result.Success());
+}
 
-  auto adb_client1 = new MockAdbClient();
+TEST_F(PlatformAndroidTest, GetFile_WithRunAs_UsesRunAsInShellCommand) {
+  auto *adb_client = new MockAdbClient();
   EXPECT_CALL(
-      *adb_client1,
-      ShellToFile(StrEq("cat '/data/data/com.example.app/lib-main/libtest.so'"),
+      *adb_client,
+      ShellToFile(StrEq("run-as 'com.example.app' "
+                        "cat '/data/data/com.example.app/lib-main/libtest.so'"),
                   _, _))
-      .Times(1)
       .WillOnce(Return(Status()));
 
+  EXPECT_CALL(*this, GetPropertyPackageName())
+      .WillOnce(Return(llvm::StringRef("com.example.app")));
+
   EXPECT_CALL(*this, GetAdbClient(_))
-      .Times(2)
-      .WillOnce(Return(ByMove(AdbClientUP(adb_client0))))
-      .WillOnce(Return(ByMove(AdbClientUP(adb_client1))));
+      .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg.Clear(); }),
+                      Return(ByMove(AdbClientUP(adb_client)))));
+
+  EXPECT_CALL(*this, GetSyncService(_))
+      .WillOnce([](Status &error) -> std::unique_ptr<AdbSyncService> {
+        error = Status::FromErrorString("Sync service unavailable");
+        return nullptr;
+      });
 
-  EXPECT_TRUE(
+  Status result =
       GetFile(FileSpec("/data/data/com.example.app/lib-main/libtest.so"),
-              FileSpec())
-          .Success());
+              FileSpec("/tmp/libtest.so"));
+  EXPECT_TRUE(result.Success());
 }
 
-TEST_F(PlatformAndroidTest, GetFileWithCatFallbackAndRunAs) {
-  auto sync_service = new MockSyncService();
-  EXPECT_CALL(
-      *sync_service,
-      Stat(FileSpec("/data/data/com.example.app/lib-main/libtest.so"), _, _, _))
-      .Times(1)
-      .WillOnce(DoAll(SetArgReferee<1>(0), Return(Status())));
+TEST_F(PlatformAndroidTest, GetFile_FilenameWithSingleQuotes_Rejected) {
+  EXPECT_CALL(*this, GetSyncService(_))
+      .WillOnce([](Status &error) -> std::unique_ptr<AdbSyncService> {
+        error = Status::FromErrorString("Sync service unavailable");
+        return nullptr;
+      });
 
-  auto adb_client0 = new MockAdbClient();
-  EXPECT_CALL(*adb_client0, GetSyncService(_))
-      .Times(1)
-      .WillOnce(Return(ByMove(SyncServiceUP(sync_service))));
+  Status result =
+      GetFile(FileSpec("/test/file'with'quotes"), FileSpec("/tmp/output"));
 
-  auto adb_client1 = new MockAdbClient();
-  EXPECT_CALL(
-      *adb_client1,
-      ShellToFile(StrEq("run-as 'com.example.app' "
-                        "cat '/data/data/com.example.app/lib-main/libtest.so'"),
-                  _, _))
-      .Times(1)
+  EXPECT_TRUE(result.Fail());
+  EXPECT_THAT(result.AsCString(), HasSubstr("single-quotes"));
+}
+
+TEST_F(PlatformAndroidTest,
+       DownloadModuleSlice_FilenameWithSingleQuotes_Rejected) {
+  Status result = DownloadModuleSlice(FileSpec("/test/file'with'quotes"), 100,
+                                      200, FileSpec("/tmp/output"));
+
+  EXPECT_TRUE(result.Fail());
+  EXPECT_THAT(result.AsCString(), HasSubstr("single-quotes"));
+}
+
+TEST_F(PlatformAndroidTest, GetFile_NetworkTimeout_PropagatesErrorCorrectly) {
+  auto *adb_client = new MockAdbClient();
+  EXPECT_CALL(*adb_client, ShellToFile(_, _, _))
+      .WillOnce(Return(Status::FromErrorString("Network timeout")));
+
+  EXPECT_CALL(*this, GetPropertyPackageName())
+      .WillOnce(Return(llvm::StringRef("")));
+
+  EXPECT_CALL(*this, GetAdbClient(_))
+      .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg.Clear(); }),
+                      Return(ByMove(AdbClientUP(adb_client)))));
+
+  EXPECT_CALL(*this, GetSyncService(_))
+      .WillOnce([](Status &error) -> std::unique_ptr<AdbSyncService> {
+        error = Status::FromErrorString("Sync service unavailable");
+        return nullptr;
+      });
+
+  Status result =
+      GetFile(FileSpec("/data/large/file.so"), FileSpec("/tmp/large.so"));
+  EXPECT_TRUE(result.Fail());
+  EXPECT_THAT(result.AsCString(), HasSubstr("Network timeout"));
+}
+
+TEST_F(PlatformAndroidTest, SyncService_ConnectionFailsGracefully) {
+  // Constructor should succeed even with a failing connection
+  AdbSyncService sync_service("test-device");
+
+  // The service should report as not connected initially
+  EXPECT_FALSE(sync_service.IsConnected());
+  EXPECT_EQ(sync_service.GetDeviceId(), "test-device");
+
+  // Operations should fail gracefully when connection setup fails
+  FileSpec remote_file("/data/test.txt");
+  FileSpec local_file("/tmp/test.txt");
+  uint32_t mode, size, mtime;
+
+  Status result = sync_service.Stat(remote_file, mode, size, mtime);
+  EXPECT_TRUE(result.Fail());
+}
+
+TEST_F(PlatformAndroidTest, GetRunAs_FormatsPackageNameCorrectly) {
+  // Empty package name
+  EXPECT_CALL(*this, GetPropertyPackageName())
+      .WillOnce(Return(llvm::StringRef("")));
+  EXPECT_EQ(this->GetRunAs(), "");
+
+  // Valid package name
+  EXPECT_CALL(*this, GetPropertyPackageName())
+      .WillOnce(Return(llvm::StringRef("com.example.test")));
+  EXPECT_EQ(this->GetRunAs(), "run-as 'com.example.test' ");
+}
+
+TEST_F(PlatformAndroidTest,
+       DownloadModuleSlice_ZeroOffset_CallsGetFileInsteadOfDd) {
+  // When offset=0, DownloadModuleSlice calls GetFile which uses 'cat', not 'dd'
+  // We need to ensure the sync service fails so GetFile falls back to shell cat
+  auto *adb_client = new MockAdbClient();
+  EXPECT_CALL(*adb_client,
+              ShellToFile(StrEq("cat '/system/lib64/libc.so'"), _, _))
       .WillOnce(Return(Status()));
 
   EXPECT_CALL(*this, GetPropertyPackageName())
-      .Times(1)
-      .WillOnce(Return(llvm::StringRef("com.example.app")));
+      .WillOnce(Return(llvm::StringRef("")));
 
   EXPECT_CALL(*this, GetAdbClient(_))
-      .Times(2)
-      .WillOnce(Return(ByMove(AdbClientUP(adb_client0))))
-      .WillOnce(Return(ByMove(AdbClientUP(adb_client1))));
+      .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg.Clear(); }),
+                      Return(ByMove(AdbClientUP(adb_client)))));
 
-  EXPECT_TRUE(
-      GetFile(FileSpec("/data/data/com.example.app/lib-main/libtest.so"),
-              FileSpec())
-          .Success());
+  // Mock GetSyncService to fail, forcing GetFile to use shell cat fallback
+  EXPECT_CALL(*this, GetSyncService(_))
+      .WillOnce(DoAll(WithArg<0>([](auto &arg) {
+                        arg =
+                            Status::FromErrorString("Sync service unavailable");
+                      }),
+                      Return(ByMove(std::unique_ptr<AdbSyncService>()))));
+
+  Status result = DownloadModuleSlice(FileSpec("/system/lib64/libc.so"), 0, 0,
+                                      FileSpec("/tmp/libc.so"));
+  EXPECT_TRUE(result.Success());
 }
diff --git a/llvm/cmake/modules/HandleLLVMStdlib.cmake b/llvm/cmake/modules/HandleLLVMStdlib.cmake
index a7e138a..dda1caa 100644
--- a/llvm/cmake/modules/HandleLLVMStdlib.cmake
+++ b/llvm/cmake/modules/HandleLLVMStdlib.cmake
@@ -2,6 +2,7 @@
 # if the user has requested it.
 
 include(DetermineGCCCompatible)
+include(CheckIncludeFiles)
 
 if(NOT DEFINED LLVM_STDLIB_HANDLED)
   set(LLVM_STDLIB_HANDLED ON)
@@ -19,7 +20,17 @@ if(NOT DEFINED LLVM_STDLIB_HANDLED)
     if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
       check_cxx_compiler_flag("-stdlib=libc++" CXX_COMPILER_SUPPORTS_STDLIB)
       check_linker_flag(CXX "-stdlib=libc++" CXX_LINKER_SUPPORTS_STDLIB)
-      if(CXX_COMPILER_SUPPORTS_STDLIB AND CXX_LINKER_SUPPORTS_STDLIB)
+
+      # Check whether C++ include files are available
+      # runtimes/CMakeLists.txt adds -nostdlib++ and -nostdinc++ to
+      # CMAKE_REQUIRED_FLAGS, which are incompatible with -stdlib=libc++; use
+      # a fresh CMAKE_REQUIRED_FLAGS environment.
+      cmake_push_check_state(RESET)
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -stdlib=libc++")
+      check_include_files("chrono" CXX_COMPILER_SUPPORTS_STDLIB_CHRONO LANGUAGE CXX)
+      cmake_pop_check_state()
+
+      if(CXX_COMPILER_SUPPORTS_STDLIB AND CXX_LINKER_SUPPORTS_STDLIB AND CXX_COMPILER_SUPPORTS_STDLIB_CHRONO)
         append("-stdlib=libc++"
           CMAKE_CXX_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS
           CMAKE_MODULE_LINKER_FLAGS)
diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in
index c15b957..c39c33f 100644
--- a/llvm/cmake/modules/LLVMConfig.cmake.in
+++ b/llvm/cmake/modules/LLVMConfig.cmake.in
@@ -55,6 +55,8 @@ endif()
 
 set(LLVM_ENABLE_RTTI @LLVM_ENABLE_RTTI@)
 
+set(LLVM_ENABLE_LIBCXX @LLVM_ENABLE_LIBCXX@)
+
 set(LLVM_ENABLE_LIBEDIT @HAVE_LIBEDIT@)
 if(LLVM_ENABLE_LIBEDIT)
   find_package(LibEdit)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 162208f..1aebcc4 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -392,7 +392,7 @@ added in the future:
     sequence in place of a call site. This convention forces the call
     arguments into registers but allows them to be dynamically
     allocated. This can currently only be used with calls to
-    llvm.experimental.patchpoint because only this intrinsic records
+    ``llvm.experimental.patchpoint`` because only this intrinsic records
     the location of its arguments in a side table. See :doc:`StackMaps`.
 "``preserve_mostcc``" - The `PreserveMost` calling convention
     This calling convention attempts to make the code in the caller as
@@ -610,7 +610,7 @@ model is not supported, or if a better choice of model can be made.
 A model can also be specified in an alias, but then it only governs how
 the alias is accessed. It will not have any effect on the aliasee.
 
-For platforms without linker support of ELF TLS model, the -femulated-tls
+For platforms without linker support of ELF TLS model, the ``-femulated-tls``
 flag can be used to generate GCC-compatible emulated TLS code.
 
 .. _runtime_preemption_model:
@@ -3641,8 +3641,8 @@ to support the somewhat common pattern in C of intentionally storing to an
 invalid pointer to crash the program. In the future, it might make sense to
 allow frontends to control this behavior.
 
-IR-level volatile loads and stores cannot safely be optimized into llvm.memcpy
-or llvm.memmove intrinsics even when those intrinsics are flagged volatile.
+IR-level volatile loads and stores cannot safely be optimized into ``llvm.memcpy``
+or ``llvm.memmove`` intrinsics even when those intrinsics are flagged volatile.
 Likewise, the backend should never split or merge target-legal volatile
 load/store instructions. Similarly, IR-level volatile loads and stores cannot
 change from integer to floating-point or vice versa.
@@ -6448,18 +6448,18 @@ descriptors <DISubrange>` or :ref:`subrange descriptors
 <DISubrangeType>`, each representing the range of subscripts at that
 level of indexing. The ``DIFlagVector`` flag to ``flags:`` indicates
 that an array type is a native packed vector. The optional
-``dataLocation`` is a DIExpression that describes how to get from an
+``dataLocation`` is a ``DIExpression`` that describes how to get from an
 object's address to the actual raw data, if they aren't
 equivalent. This is only supported for array types, particularly to
 describe Fortran arrays, which have an array descriptor in addition to
-the array data. Alternatively it can also be DIVariable which has the
+the array data. Alternatively it can also be ``DIVariable`` which has the
 address of the actual raw data. The Fortran language supports pointer
 arrays which can be attached to actual arrays, this attachment between
 pointer and pointee is called association.  The optional
-``associated`` is a DIExpression that describes whether the pointer
+``associated`` is a ``DIExpression`` that describes whether the pointer
 array is currently associated.  The optional ``allocated`` is a
-DIExpression that describes whether the allocatable array is currently
-allocated.  The optional ``rank`` is a DIExpression that describes the
+``DIExpression`` that describes whether the allocatable array is currently
+allocated.  The optional ``rank`` is a ``DIExpression`` that describes the
 rank (number of dimensions) of fortran assumed rank array (rank is
 known at runtime).  The optional ``bitStride`` is an unsigned constant
 that describes the number of bits occupied by an element of the array;
@@ -6763,7 +6763,7 @@ expression language. They are used in :ref:`debug records <debugrecords>`
 referenced LLVM variable relates to the source language variable. Debug
 expressions are interpreted left-to-right: start by pushing the value/address
 operand of the record onto a stack, then repeatedly push and evaluate
-opcodes from the DIExpression until the final variable description is produced.
+opcodes from the ``DIExpression`` until the final variable description is produced.
 
 The current supported opcode vocabulary is limited:
 
@@ -6852,7 +6852,7 @@ The current supported opcode vocabulary is limited:
 - ``DW_OP_LLVM_implicit_pointer`` It specifies the dereferenced value. It can
   be used to represent pointer variables which are optimized out but the value
   it points to is known. This operator is required as it is different than DWARF
-  operator DW_OP_implicit_pointer in representation and specification (number
+  operator ``DW_OP_implicit_pointer`` in representation and specification (number
   and types of operands) and later can not be used as multiple level.
 
 .. code-block:: text
@@ -6889,22 +6889,22 @@ in registers or in memory (see ``DW_OP_stack_value``).
 
 A ``#dbg_declare`` record describes an indirect value (the address) of a
 source variable. The first operand of the record must be an address of some
-kind. A DIExpression operand to the record refines this address to produce a
+kind. A ``DIExpression`` operand to the record refines this address to produce a
 concrete location for the source variable.
 
 A ``#dbg_value`` record describes the direct value of a source variable.
 The first operand of the record may be a direct or indirect value. A
-DIExpression operand to the record refines the first operand to produce a
+``DIExpression`` operand to the record refines the first operand to produce a
 direct value. For example, if the first operand is an indirect value, it may be
-necessary to insert ``DW_OP_deref`` into the DIExpression in order to produce a
+necessary to insert ``DW_OP_deref`` into the ``DIExpression`` in order to produce a
 valid debug record.
 
 .. note::
 
-   A DIExpression is interpreted in the same way regardless of which kind of
+   A ``DIExpression`` is interpreted in the same way regardless of which kind of
    debug record it's attached to.
 
-   DIExpressions are always printed and parsed inline; they can never be
+   ``DIExpression`` nodes are always printed and parsed inline; they can never be
    referenced by an ID (e.g. ``!1``).
 
 .. code-block:: text
@@ -6944,7 +6944,7 @@ DIArgList
 ``DIArgList`` nodes hold a list of constant or SSA value references. These are
 used in :ref:`debug records <debugrecords>` in combination with a
 ``DIExpression`` that uses the
-``DW_OP_LLVM_arg`` operator. Because a DIArgList may refer to local values
+``DW_OP_LLVM_arg`` operator. Because a ``DIArgList`` may refer to local values
 within a function, it must only be used as a function argument, must always be
 inlined, and cannot appear in named metadata.
 
@@ -6962,7 +6962,7 @@ These flags encode various properties of DINodes.
 
 The `ExportSymbols` flag marks a class, struct or union whose members
 may be referenced as if they were defined in the containing class or
-union. This flag is used to decide whether the DW_AT_export_symbols can
+union. This flag is used to decide whether the ``DW_AT_export_symbols`` can
 be used for the structure type.
 
 DIObjCProperty
@@ -7539,7 +7539,7 @@ sections that the user does not want removed after linking.
 
 ``unpredictable`` metadata may be attached to any branch or switch
 instruction. It can be used to express the unpredictability of control
-flow. Similar to the llvm.expect intrinsic, it may be used to alter
+flow. Similar to the ``llvm.expect`` intrinsic, it may be used to alter
 optimizations related to compare and branch instructions. The metadata
 is treated as a boolean value; if it exists, it signals that the branch
 or switch that it is attached to is completely unpredictable.
@@ -7977,7 +7977,7 @@ performed on this loop. The metadata has a single operand which is the string
 
    !0 = !{!"llvm.licm.disable"}
 
-Note that although it operates per loop it isn't given the llvm.loop prefix
+Note that although it operates per loop it isn't given the ``llvm.loop`` prefix
 as it is not affected by the ``llvm.loop.disable_nonforced`` metadata.
 
 '``llvm.access.group``' Metadata
@@ -8174,8 +8174,8 @@ Examples:
 
    !0 = !{}
 
-The invariant.group metadata must be dropped when replacing one pointer by
-another based on aliasing information. This is because invariant.group is tied
+The ``invariant.group`` metadata must be dropped when replacing one pointer by
+another based on aliasing information. This is because ``invariant.group`` is tied
 to the SSA value of the pointer operand.
 
 .. code-block:: llvm
@@ -8275,9 +8275,9 @@ value profile information. Currently this is indirect calls (where it
 records the hottest callees) and calls to memory intrinsics such as memcpy,
 memmove, and memset (where it records the hottest byte lengths).
 
-Each VP metadata node contains "VP" string, then a uint32_t value for the value
-profiling kind, a uint64_t value for the total number of times the instruction
-is executed, followed by uint64_t value and execution count pairs.
+Each VP metadata node contains a "VP" string, then a ``uint32_t`` value for the value
+profiling kind, a ``uint64_t`` value for the total number of times the instruction
+is executed, followed by ``uint64_t`` value and execution count pairs.
 The value profiling kind is 0 for indirect call targets and 1 for memory
 operations. For indirect call targets, each profile value is a hash
 of the callee function name, and for memory operations each value is the
@@ -15744,7 +15744,7 @@ external functions.
 Syntax:
 """""""
 
-This is an overloaded intrinsic. You can use llvm.memmove on any integer
+This is an overloaded intrinsic. You can use ``llvm.memmove`` on any integer
 bit width and for different address space. Not all targets support all
 bit widths however.
 
@@ -15805,7 +15805,7 @@ otherwise the behavior is undefined.
 Syntax:
 """""""
 
-This is an overloaded intrinsic. You can use llvm.memset on any integer
+This is an overloaded intrinsic. You can use ``llvm.memset`` on any integer
 bit width and for different address spaces. However, not all targets
 support all bit widths.
 
@@ -17994,7 +17994,7 @@ operate on a per-element basis and the element order is not affected.
 Syntax:
 """""""
 
-This is an overloaded intrinsic. You can use llvm.ctpop on any integer
+This is an overloaded intrinsic. You can use ``llvm.ctpop`` on any integer
 bit width, or on any vector with integer elements. Not all targets
 support all bit widths or vector types, however.
 
@@ -26414,7 +26414,7 @@ This is an overloaded intrinsic.
 Overview:
 """""""""
 
-Predicated llvm.is.fpclass :ref:`llvm.is.fpclass <llvm.is.fpclass>`
+Predicated ``llvm.is.fpclass`` :ref:`llvm.is.fpclass <llvm.is.fpclass>`
 
 Arguments:
 """"""""""
@@ -26429,7 +26429,7 @@ operation.
 Semantics:
 """"""""""
 
-The '``llvm.vp.is.fpclass``' intrinsic performs llvm.is.fpclass (:ref:`llvm.is.fpclass <llvm.is.fpclass>`).
+The '``llvm.vp.is.fpclass``' intrinsic performs ``llvm.is.fpclass`` (:ref:`llvm.is.fpclass <llvm.is.fpclass>`).
 
 
 Examples:
@@ -28493,7 +28493,7 @@ environment.  The rounding mode argument is only intended as information
 to the compiler.
 
 If the runtime floating-point environment is using the default rounding mode
-then the results will be the same as the llvm.lrint intrinsic.
+then the results will be the same as the ``llvm.lrint`` intrinsic.
 
 
 '``llvm.experimental.constrained.llrint``' Intrinsic
@@ -28541,7 +28541,7 @@ environment.  The rounding mode argument is only intended as information
 to the compiler.
 
 If the runtime floating-point environment is using the default rounding mode
-then the results will be the same as the llvm.llrint intrinsic.
+then the results will be the same as the ``llvm.llrint`` intrinsic.
 
 
 '``llvm.experimental.constrained.nearbyint``' Intrinsic
@@ -30457,7 +30457,7 @@ has externally observable side effects.
 Syntax:
 """""""
 
-This is an overloaded intrinsic. You can use llvm.is.constant with any argument type.
+This is an overloaded intrinsic. You can use ``llvm.is.constant`` with any argument type.
 
 ::
 
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 9879d0d..d02cf98 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -2695,6 +2695,14 @@ LLVM_C_ABI void LLVMGlobalSetMetadata(LLVMValueRef Global, unsigned Kind,
                                       LLVMMetadataRef MD);
 
 /**
+ * Adds a metadata attachment.
+ *
+ * @see llvm::GlobalObject::addMetadata()
+ */
+LLVM_C_ABI void LLVMGlobalAddMetadata(LLVMValueRef Global, unsigned Kind,
+                                      LLVMMetadataRef MD);
+
+/**
  * Erases a metadata attachment of the given kind if it exists.
  *
  * @see llvm::GlobalObject::eraseMetadata()
@@ -2709,6 +2717,14 @@ LLVM_C_ABI void LLVMGlobalEraseMetadata(LLVMValueRef Global, unsigned Kind);
 LLVM_C_ABI void LLVMGlobalClearMetadata(LLVMValueRef Global);
 
 /**
+ * Add debuginfo metadata to this global.
+ *
+ * @see llvm::GlobalVariable::addDebugInfo()
+ */
+LLVM_C_ABI void LLVMGlobalAddDebugInfo(LLVMValueRef Global,
+                                       LLVMMetadataRef GVE);
+
+/**
  * Retrieves an array of metadata entries representing the metadata attached to
  * this value. The caller is responsible for freeing this array by calling
  * \c LLVMDisposeValueMetadataEntries.
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index 2955063..4d60ad4 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -2294,6 +2294,12 @@ LLVM_ABI APInt mulhs(const APInt &C1, const APInt &C2);
 /// Returns the high N bits of the multiplication result.
 LLVM_ABI APInt mulhu(const APInt &C1, const APInt &C2);
 
+/// Performs (2*N)-bit multiplication on sign-extended operands.
+LLVM_ABI APInt mulsExtended(const APInt &C1, const APInt &C2);
+
+/// Performs (2*N)-bit multiplication on zero-extended operands.
+LLVM_ABI APInt muluExtended(const APInt &C1, const APInt &C2);
+
 /// Compute X^N for N>=0.
 /// 0^0 is supported and returns 1.
 LLVM_ABI APInt pow(const APInt &X, int64_t N);
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 1f27213..2a35c3d 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/EpochTracker.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLForwardCompat.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
@@ -811,10 +812,12 @@ public:
     this->insert(I, E);
   }
 
-  DenseMap(std::initializer_list<typename BaseT::value_type> Vals) {
-    init(Vals.size());
-    this->insert(Vals.begin(), Vals.end());
-  }
+  template <typename RangeT>
+  DenseMap(llvm::from_range_t, const RangeT &Range)
+      : DenseMap(adl_begin(Range), adl_end(Range)) {}
+
+  DenseMap(std::initializer_list<typename BaseT::value_type> Vals)
+      : DenseMap(Vals.begin(), Vals.end()) {}
 
   ~DenseMap() {
     this->destroyAll();
@@ -985,6 +988,10 @@ public:
     this->insert(I, E);
   }
 
+  template <typename RangeT>
+  SmallDenseMap(llvm::from_range_t, const RangeT &Range)
+      : SmallDenseMap(adl_begin(Range), adl_end(Range)) {}
+
   SmallDenseMap(std::initializer_list<typename BaseT::value_type> Vals)
       : SmallDenseMap(Vals.begin(), Vals.end()) {}
 
diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h
index 28a217a..73ec7c6 100644
--- a/llvm/include/llvm/ADT/SmallPtrSet.h
+++ b/llvm/include/llvm/ADT/SmallPtrSet.h
@@ -129,7 +129,7 @@ public:
     // We must Grow -- find the size where we'd be 75% full, then round up to
     // the next power of two.
     size_type NewSize = NumEntries + (NumEntries / 3);
-    NewSize = 1 << Log2_32_Ceil(NewSize);
+    NewSize = llvm::bit_ceil(NewSize);
     // Like insert_imp_big, always allocate at least 128 elements.
     NewSize = std::max(128u, NewSize);
     Grow(NewSize);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 53c91bf..9186419 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1681,8 +1681,8 @@ public:
   /// was used in order to get the Ptr step value. \p Ptr holds the SCEV of the
   /// access pointer.
   LLVM_ABI InstructionCost
-  getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE = nullptr,
-                            const SCEV *Ptr = nullptr) const;
+  getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+                            TTI::TargetCostKind CostKind) const;
 
   /// \returns The cost, if any, of keeping values of the given types alive
   /// over a callsite.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index e879712..200cbaf 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -939,7 +939,8 @@ public:
 
   virtual InstructionCost getAddressComputationCost(Type *PtrTy,
                                                     ScalarEvolution *,
-                                                    const SCEV *) const {
+                                                    const SCEV *,
+                                                    TTI::TargetCostKind) const {
     return 0;
   }
 
diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h
index 362cc30..98b5257 100644
--- a/llvm/include/llvm/CodeGen/Analysis.h
+++ b/llvm/include/llvm/CodeGen/Analysis.h
@@ -55,6 +55,13 @@ inline unsigned ComputeLinearIndex(Type *Ty,
   return ComputeLinearIndex(Ty, Indices.begin(), Indices.end(), CurIndex);
 }
 
+/// Given an LLVM IR type, compute non-aggregate subtypes. Optionally also
+/// compute their offsets.
+void ComputeValueTypes(const DataLayout &DL, Type *Ty,
+                       SmallVectorImpl<Type *> &Types,
+                       SmallVectorImpl<TypeSize> *Offsets = nullptr,
+                       TypeSize StartingOffset = TypeSize::getZero());
+
 /// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
 /// EVTs that represent all the individual underlying
 /// non-aggregate types that comprise it.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 1216433..aa9d1f0 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -3026,8 +3026,9 @@ public:
     return LT.first.getValue();
   }
 
-  InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *,
-                                            const SCEV *) const override {
+  InstructionCost
+  getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *,
+                            TTI::TargetCostKind) const override {
     return 0;
   }
 
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index be90250..c468f2f 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -1098,9 +1098,9 @@ struct ConstantInt_match {
                                       BindVal ? *BindVal : Discard);
   }
 };
-/// Match any interger constants or splat of an integer constant.
+/// Match any integer constants or splat of an integer constant.
 inline ConstantInt_match m_ConstInt() { return ConstantInt_match(nullptr); }
-/// Match any interger constants or splat of an integer constant; return the
+/// Match any integer constants or splat of an integer constant; return the
 /// specific constant or constant splat value.
 inline ConstantInt_match m_ConstInt(APInt &V) { return ConstantInt_match(&V); }
 
diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h
index aa8af69..f197c7f 100644
--- a/llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -203,8 +203,14 @@ namespace ISD {
   ///
   struct InputArg {
     ArgFlagsTy Flags;
+    /// Legalized type of this argument part.
     MVT VT = MVT::Other;
+    /// Usually the non-legalized type of the argument, which is the EVT
+    /// corresponding to the OrigTy IR type. However, for post-legalization
+    /// libcalls, this will be a legalized type.
     EVT ArgVT;
+    /// Original IR type of the argument. For aggregates, this is the type of
+    /// an individual aggregate element, not the whole aggregate.
     Type *OrigTy;
     bool Used;
 
@@ -239,8 +245,13 @@ namespace ISD {
   ///
   struct OutputArg {
     ArgFlagsTy Flags;
+    /// Legalized type of this argument part.
     MVT VT;
+    /// Non-legalized type of the argument. This is the EVT corresponding to
+    /// the OrigTy IR type.
     EVT ArgVT;
+    /// Original IR type of the argument. For aggregates, this is the type of
+    /// an individual aggregate element, not the whole aggregate.
     Type *OrigTy;
 
     /// Index original Function's argument.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d31a733..20e4dfa 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3560,12 +3560,6 @@ public:
     return Libcalls.getLibcallImplName(Call);
   }
 
-  /// Check if this is valid libcall for the current module, otherwise
-  /// RTLIB::Unsupported.
-  RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const {
-    return Libcalls.getSupportedLibcallImpl(FuncName);
-  }
-
   const char *getMemcpyName() const { return Libcalls.getMemcpyName(); }
 
   /// Get the comparison predicate that's to be used to test the result of the
@@ -4894,11 +4888,10 @@ public:
       return *this;
     }
 
-    MakeLibCallOptions &setTypeListBeforeSoften(ArrayRef<EVT> OpsVT, EVT RetVT,
-                                                bool Value = true) {
+    MakeLibCallOptions &setTypeListBeforeSoften(ArrayRef<EVT> OpsVT, EVT RetVT) {
       OpsVTBeforeSoften = OpsVT;
       RetVTBeforeSoften = RetVT;
-      IsSoften = Value;
+      IsSoften = true;
       return *this;
     }
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h b/llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h
index 0316589..567a58e 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h
@@ -153,12 +153,20 @@ private:
 using GVPredicate = std::function<bool(const GlobalValue &)>;
 using GVModifier = std::function<void(GlobalValue &)>;
 
-/// Clones teh given module onto the given context.
+/// Clones the given module onto the given context.
 LLVM_ABI ThreadSafeModule
 cloneToContext(const ThreadSafeModule &TSMW, ThreadSafeContext TSCtx,
                GVPredicate ShouldCloneDef = GVPredicate(),
                GVModifier UpdateClonedDefSource = GVModifier());
 
+/// Clone the given module onto the given context.
+/// The caller is responsible for ensuring that the source module and its
+/// LLVMContext will not be concurrently accessed during the clone.
+LLVM_ABI ThreadSafeModule
+cloneExternalModuleToContext(const Module &M, ThreadSafeContext TSCtx,
+                             GVPredicate ShouldCloneDef = GVPredicate(),
+                             GVModifier UpdateClonedDefSource = GVModifier());
+
 /// Clones the given module on to a new context.
 LLVM_ABI ThreadSafeModule cloneToNewContext(
     const ThreadSafeModule &TSMW, GVPredicate ShouldCloneDef = GVPredicate(),
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index f39e2e3..2d1d07c 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -182,6 +182,14 @@ private:
     return true;
   }
 
+  static bool hasAEABILibcalls(const Triple &TT) {
+    return TT.isTargetAEABI() || TT.isTargetGNUAEABI() ||
+           TT.isTargetMuslAEABI() || TT.isAndroid();
+  }
+
+  LLVM_READONLY
+  static bool isAAPCS_ABI(const Triple &TT, StringRef ABIName);
+
   static bool darwinHasExp10(const Triple &TT);
 
   /// Return true if the target has sincosf/sincos/sincosl functions
@@ -195,8 +203,8 @@ private:
   }
 
   /// Generated by tablegen.
-  void setTargetRuntimeLibcallSets(const Triple &TT,
-                                   FloatABI::ABIType FloatABI);
+  void setTargetRuntimeLibcallSets(const Triple &TT, FloatABI::ABIType FloatABI,
+                                   EABI ABIType, StringRef ABIName);
 
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index 9259543..9072a0a 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -406,17 +406,6 @@ multiclass LibmLongDoubleLibCall<string libcall_basename = !toupper(NAME),
 def SC_MEMCPY : RuntimeLibcall;
 def SC_MEMMOVE : RuntimeLibcall;
 def SC_MEMSET : RuntimeLibcall;
-def SC_MEMCHR: RuntimeLibcall;
-
-// AArch64 SME ABI calls
-def SMEABI_SME_STATE : RuntimeLibcall;
-def SMEABI_TPIDR2_SAVE : RuntimeLibcall;
-def SMEABI_ZA_DISABLE : RuntimeLibcall;
-def SMEABI_TPIDR2_RESTORE : RuntimeLibcall;
-def SMEABI_GET_CURRENT_VG : RuntimeLibcall;
-def SMEABI_SME_STATE_SIZE : RuntimeLibcall;
-def SMEABI_SME_SAVE : RuntimeLibcall;
-def SMEABI_SME_RESTORE : RuntimeLibcall;
 
 // ARM EABI calls
 def AEABI_MEMCPY4 : RuntimeLibcall; // Align 4
@@ -1234,35 +1223,8 @@ defset list<RuntimeLibcallImpl> AArch64LibcallImpls = {
   def __arm_sc_memcpy : RuntimeLibcallImpl<SC_MEMCPY>;
   def __arm_sc_memmove : RuntimeLibcallImpl<SC_MEMMOVE>;
   def __arm_sc_memset : RuntimeLibcallImpl<SC_MEMSET>;
-  def __arm_sc_memchr : RuntimeLibcallImpl<SC_MEMCHR>;
 } // End AArch64LibcallImpls
 
-def __arm_sme_state : RuntimeLibcallImpl<SMEABI_SME_STATE>;
-def __arm_tpidr2_save : RuntimeLibcallImpl<SMEABI_TPIDR2_SAVE>;
-def __arm_za_disable : RuntimeLibcallImpl<SMEABI_ZA_DISABLE>;
-def __arm_tpidr2_restore : RuntimeLibcallImpl<SMEABI_TPIDR2_RESTORE>;
-def __arm_get_current_vg : RuntimeLibcallImpl<SMEABI_GET_CURRENT_VG>;
-def __arm_sme_state_size : RuntimeLibcallImpl<SMEABI_SME_STATE_SIZE>;
-def __arm_sme_save : RuntimeLibcallImpl<SMEABI_SME_SAVE>;
-def __arm_sme_restore : RuntimeLibcallImpl<SMEABI_SME_RESTORE>;
-
-def SMEABI_LibCalls_PreserveMost_From_X0 : LibcallsWithCC<(add
-  __arm_tpidr2_save,
-  __arm_za_disable,
-  __arm_tpidr2_restore),
-  SMEABI_PreserveMost_From_X0>;
-
-def SMEABI_LibCalls_PreserveMost_From_X1 : LibcallsWithCC<(add
-  __arm_get_current_vg,
-  __arm_sme_state_size,
-  __arm_sme_save,
-  __arm_sme_restore),
-  SMEABI_PreserveMost_From_X1>;
-
-def SMEABI_LibCalls_PreserveMost_From_X2 : LibcallsWithCC<(add
-  __arm_sme_state),
-  SMEABI_PreserveMost_From_X2>;
-
 def isAArch64_ExceptArm64EC
     : RuntimeLibcallPredicate<"(TT.isAArch64() && !TT.isWindowsArm64EC())">;
 def isWindowsArm64EC : RuntimeLibcallPredicate<"TT.isWindowsArm64EC()">;
@@ -1282,10 +1244,7 @@ def AArch64SystemLibrary : SystemRuntimeLibrary<
        LibmHasSinCosF32, LibmHasSinCosF64, LibmHasSinCosF128,
        DefaultLibmExp10,
        DefaultStackProtector,
-       SecurityCheckCookieIfWinMSVC,
-       SMEABI_LibCalls_PreserveMost_From_X0,
-       SMEABI_LibCalls_PreserveMost_From_X1,
-       SMEABI_LibCalls_PreserveMost_From_X2)
+       SecurityCheckCookieIfWinMSVC)
 >;
 
 // Prepend a # to every name
@@ -1321,6 +1280,9 @@ def AMDGPUSystemLibrary : SystemRuntimeLibrary<isAMDGPU, (add)>;
 // ARM Runtime Libcalls
 //===----------------------------------------------------------------------===//
 
+def isTargetAEABIAndAAPCS_ABI : RuntimeLibcallPredicate<
+  [{TT.isTargetAEABI() && isAAPCS_ABI(TT, ABIName)}]>;
+
 // if (isTargetMachO()) {
 // if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
 //  Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
@@ -1514,13 +1476,33 @@ def __udivmodsi4 : RuntimeLibcallImpl<UDIVREM_I32>;
 // a __gnu_ prefix (which is the default).
 // isTargetAEABI()
 def __aeabi_f2h : RuntimeLibcallImpl<FPROUND_F32_F16>; // CallingConv::ARM_AAPCS
-//def __aeabi_d2h : RuntimeLibcallImpl<FPROUND_F64_F16>; // CallingConv::ARM_AAPCS
 def __aeabi_h2f : RuntimeLibcallImpl<FPEXT_F16_F32>; // CallingConv::ARM_AAPCS
 
 // !isTargetMachO()
 def __gnu_f2h_ieee : RuntimeLibcallImpl<FPROUND_F32_F16>;
 def __gnu_h2f_ieee : RuntimeLibcallImpl<FPEXT_F16_F32>;
 
+// In EABI, these functions have an __aeabi_ prefix, but in GNUEABI
+// they have a __gnu_ prefix (which is the default).
+def EABIHalfConvertCalls : LibcallImpls<(add __aeabi_f2h, __aeabi_h2f),
+                                        isTargetAEABIAndAAPCS_ABI> {
+  let CallingConv = ARM_AAPCS;
+}
+
+// The half <-> float conversion functions are always soft-float on
+// non-watchos platforms, but are needed for some targets which use a
+// hard-float calling convention by default.
+def ARMHalfConvertLibcallCallingConv : LibcallCallingConv<
+  [{TT.isWatchABI() ? DefaultCC :
+    (isAAPCS_ABI(TT, ABIName) ? CallingConv::ARM_AAPCS : CallingConv::ARM_APCS)}]
+>;
+
+def GNUEABIHalfConvertCalls :
+  LibcallImpls<(add __gnu_f2h_ieee, __gnu_h2f_ieee),
+    RuntimeLibcallPredicate<[{!TT.isOSBinFormatMachO() &&
+                              !TT.isTargetAEABI()}]>> {
+  let CallingConv = ARMHalfConvertLibcallCallingConv;
+}
 
 def WindowARMDivRemCalls : LibcallImpls<
   (add __rt_sdiv, __rt_sdiv64, __rt_udiv, __rt_udiv64),
@@ -1539,8 +1521,101 @@ def WindowARMFPIntCasts : LibcallImpls<
 def AEABIDivRemCalls : LibcallImpls<
   (add __aeabi_idivmod, __aeabi_ldivmod,
        __aeabi_uidivmod, __aeabi_uldivmod),
-  RuntimeLibcallPredicate<[{TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
-    TT.isTargetMuslAEABI()}]>> {
+  RuntimeLibcallPredicate<[{hasAEABILibcalls(TT)}]>> {
+  let CallingConv = ARM_AAPCS;
+}
+
+def AEABICalls : LibcallImpls<
+  (add
+      // Double-precision floating-point arithmetic helper functions
+      // RTABI chapter 4.1.2, Table 2
+      __aeabi_dadd,
+      __aeabi_ddiv,
+      __aeabi_dmul,
+      __aeabi_dsub,
+
+      // Double-precision floating-point comparison helper functions
+      // RTABI chapter 4.1.2, Table 3
+      __aeabi_dcmpeq__oeq,
+      __aeabi_dcmpeq__une,
+      __aeabi_dcmplt,
+      __aeabi_dcmple,
+      __aeabi_dcmpge,
+      __aeabi_dcmpgt,
+       __aeabi_dcmpun,
+
+      // Single-precision floating-point arithmetic helper functions
+      // RTABI chapter 4.1.2, Table 4
+      __aeabi_fadd,
+      __aeabi_fdiv,
+      __aeabi_fmul,
+      __aeabi_fsub,
+
+      // Single-precision floating-point comparison helper functions
+      // RTABI chapter 4.1.2, Table 5
+      __aeabi_fcmpeq__oeq,
+      __aeabi_fcmpeq__une,
+      __aeabi_fcmplt,
+      __aeabi_fcmple,
+      __aeabi_fcmpge,
+      __aeabi_fcmpgt,
+      __aeabi_fcmpun,
+
+      // Floating-point to integer conversions.
+      // RTABI chapter 4.1.2, Table 6
+      __aeabi_d2iz,
+      __aeabi_d2uiz,
+      __aeabi_d2lz,
+      __aeabi_d2ulz,
+      __aeabi_f2iz,
+      __aeabi_f2uiz,
+      __aeabi_f2lz,
+      __aeabi_f2ulz,
+
+      // Conversions between floating types.
+      // RTABI chapter 4.1.2, Table 7
+      __aeabi_d2f,
+      __aeabi_f2d,
+      //__aeabi_h2f added separately
+      //__aeabi_f2h added separately
+      __aeabi_d2h,
+
+      // Integer to floating-point conversions.
+      // RTABI chapter 4.1.2, Table 8
+      __aeabi_i2d,
+      __aeabi_ui2d,
+      __aeabi_l2d,
+      __aeabi_ul2d,
+      __aeabi_i2f,
+      __aeabi_ui2f,
+      __aeabi_l2f,
+      __aeabi_ul2f,
+
+      // Long long helper functions
+      // RTABI chapter 4.2, Table 9
+      __aeabi_lmul,
+      __aeabi_llsl,
+      __aeabi_llsr,
+      __aeabi_lasr,
+
+      // Integer division functions
+      // RTABI chapter 4.3.1
+      __aeabi_idiv,
+      __aeabi_uidiv),
+  RuntimeLibcallPredicate<[{hasAEABILibcalls(TT) && isAAPCS_ABI(TT, ABIName)}]>> {
+  let CallingConv = ARM_AAPCS;
+}
+
+// EABI dependent RTLIB, Memory operations
+// RTABI chapter 4.3.4
+def AEABI45MemCalls : LibcallImpls<
+  (add __aeabi_memcpy,  __aeabi_memcpy4, __aeabi_memcpy8,
+       __aeabi_memmove, __aeabi_memmove4, __aeabi_memmove8,
+       __aeabi_memset, __aeabi_memset4, __aeabi_memset8,
+       __aeabi_memclr, __aeabi_memclr4, __aeabi_memclr8),
+  RuntimeLibcallPredicate<[{(EABIVersion == EABI::EABI4 ||
+                             EABIVersion == EABI::EABI5) &&
+                             hasAEABILibcalls(TT) && isAAPCS_ABI(TT, ABIName)}]>> {
   let CallingConv = ARM_AAPCS;
 }
 
@@ -1560,6 +1635,11 @@ def ARMSystemLibrary
            LibmHasSinCosF32, LibmHasSinCosF64, LibmHasSinCosF128,
            DefaultLibmExp10,
 
+           AEABICalls,
+           AEABI45MemCalls,
+           EABIHalfConvertCalls,
+           GNUEABIHalfConvertCalls,
+
            // Use divmod compiler-rt calls for iOS 5.0 and later.
            LibcallImpls<(add __divmodsi4, __udivmodsi4),
                         RuntimeLibcallPredicate<[{TT.isOSBinFormatMachO() &&
diff --git a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
index b5752c1..601c291 100644
--- a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
+++ b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
@@ -36,9 +36,6 @@ def ARM_AAPCS : LibcallCallingConv<[{CallingConv::ARM_AAPCS}]>;
 def ARM_AAPCS_VFP : LibcallCallingConv<[{CallingConv::ARM_AAPCS_VFP}]>;
 def X86_STDCALL : LibcallCallingConv<[{CallingConv::X86_StdCall}]>;
 def AVR_BUILTIN : LibcallCallingConv<[{CallingConv::AVR_BUILTIN}]>;
-def SMEABI_PreserveMost_From_X0 : LibcallCallingConv<[{CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0}]>;
-def SMEABI_PreserveMost_From_X1 : LibcallCallingConv<[{CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1}]>;
-def SMEABI_PreserveMost_From_X2 : LibcallCallingConv<[{CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2}]>;
 
 /// Abstract definition for functionality the compiler may need to
 /// emit a call to. Emits the RTLIB::Libcall enum - This enum defines
diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h
index 4c683aa..3cdbf84 100644
--- a/llvm/include/llvm/MCA/Instruction.h
+++ b/llvm/include/llvm/MCA/Instruction.h
@@ -382,6 +382,10 @@ public:
   bool isReadZero() const { return IsZero; }
   void setReadZero() { IsZero = true; }
   void setPRF(unsigned ID) { PRFID = ID; }
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
 };
 
 /// A sequence of cycles.
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index f536024..bab1963 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -526,24 +526,27 @@ private:
   // so it doesn't use a StringSet for function names.
   StringSet<> VTableNames;
   // A map from MD5 keys to function name strings.
-  std::vector<std::pair<uint64_t, StringRef>> MD5NameMap;
+  mutable std::vector<std::pair<uint64_t, StringRef>> MD5NameMap;
   // A map from MD5 keys to function define. We only populate this map
   // when build the Symtab from a Module.
-  std::vector<std::pair<uint64_t, Function *>> MD5FuncMap;
+  mutable std::vector<std::pair<uint64_t, Function *>> MD5FuncMap;
   // A map from MD5 to the global variable. This map is only populated when
   // building the symtab from a module. Use separate container instances for
   // `MD5FuncMap` and `MD5VTableMap`.
   // TODO: Unify the container type and the lambda function 'mapName' inside
   // add{Func,VTable}WithName.
-  DenseMap<uint64_t, GlobalVariable *> MD5VTableMap;
+  mutable DenseMap<uint64_t, GlobalVariable *> MD5VTableMap;
   // A map from function runtime address to function name MD5 hash.
   // This map is only populated and used by raw instr profile reader.
-  AddrHashMap AddrToMD5Map;
+  mutable AddrHashMap AddrToMD5Map;
 
   AddrIntervalMap::Allocator VTableAddrMapAllocator;
   // This map is only populated and used by raw instr profile reader.
   AddrIntervalMap VTableAddrMap;
-  bool Sorted = false;
+
+  // Inverse "dirty" flag for the rest of the mutable state: lookup APIs (like
+  // getFunction) need that state to be sorted before they search it.
+  mutable bool Sorted = false;
 
   static StringRef getExternalSymbol() { return "** External Symbol **"; }
 
@@ -565,8 +568,10 @@ private:
   // If the symtab is created by a series of calls to \c addFuncName, \c
   // finalizeSymtab needs to be called before looking up function names.
   // This is required because the underlying map is a vector (for space
-  // efficiency) which needs to be sorted.
-  inline void finalizeSymtab();
+  // efficiency) which needs to be sorted. This method is `const` because it is
+  // an implementation detail of the `const` lookup APIs, which must first
+  // establish this ordering on the mutable state.
+  inline void finalizeSymtab() const;
 
 public:
   InstrProfSymtab() : VTableAddrMap(VTableAddrMapAllocator) {}
@@ -676,24 +681,25 @@ public:
   }
 
   /// Return a function's hash, or 0, if the function isn't in this SymTab.
-  LLVM_ABI uint64_t getFunctionHashFromAddress(uint64_t Address);
+  LLVM_ABI uint64_t getFunctionHashFromAddress(uint64_t Address) const;
 
   /// Return a vtable's hash, or 0 if the vtable doesn't exist in this SymTab.
-  LLVM_ABI uint64_t getVTableHashFromAddress(uint64_t Address);
+  LLVM_ABI uint64_t getVTableHashFromAddress(uint64_t Address) const;
 
   /// Return function's PGO name from the function name's symbol
   /// address in the object file. If an error occurs, return
   /// an empty string.
-  LLVM_ABI StringRef getFuncName(uint64_t FuncNameAddress, size_t NameSize);
+  LLVM_ABI StringRef getFuncName(uint64_t FuncNameAddress,
+                                 size_t NameSize) const;
 
   /// Return name of functions or global variables from the name's md5 hash
   /// value. If not found, return an empty string.
-  inline StringRef getFuncOrVarName(uint64_t ValMD5Hash);
+  inline StringRef getFuncOrVarName(uint64_t ValMD5Hash) const;
 
   /// Just like getFuncOrVarName, except that it will return literal string
   /// 'External Symbol' if the function or global variable is external to
   /// this symbol table.
-  inline StringRef getFuncOrVarNameIfDefined(uint64_t ValMD5Hash);
+  inline StringRef getFuncOrVarNameIfDefined(uint64_t ValMD5Hash) const;
 
   /// True if Symbol is the value used to represent external symbols.
   static bool isExternalSymbol(const StringRef &Symbol) {
@@ -701,11 +707,11 @@ public:
   }
 
   /// Return function from the name's md5 hash. Return nullptr if not found.
-  inline Function *getFunction(uint64_t FuncMD5Hash);
+  inline Function *getFunction(uint64_t FuncMD5Hash) const;
 
   /// Return the global variable corresponding to md5 hash. Return nullptr if
   /// not found.
-  inline GlobalVariable *getGlobalVariable(uint64_t MD5Hash);
+  inline GlobalVariable *getGlobalVariable(uint64_t MD5Hash) const;
 
   /// Return the name section data.
   inline StringRef getNameData() const { return Data; }
@@ -748,7 +754,7 @@ Error InstrProfSymtab::create(const FuncNameIterRange &FuncIterRange,
   return Error::success();
 }
 
-void InstrProfSymtab::finalizeSymtab() {
+void InstrProfSymtab::finalizeSymtab() const {
   if (Sorted)
     return;
   llvm::sort(MD5NameMap, less_first());
@@ -758,14 +764,14 @@ void InstrProfSymtab::finalizeSymtab() {
   Sorted = true;
 }
 
-StringRef InstrProfSymtab::getFuncOrVarNameIfDefined(uint64_t MD5Hash) {
-  StringRef ret = getFuncOrVarName(MD5Hash);
-  if (ret.empty())
+StringRef InstrProfSymtab::getFuncOrVarNameIfDefined(uint64_t MD5Hash) const {
+  StringRef Ret = getFuncOrVarName(MD5Hash);
+  if (Ret.empty())
     return InstrProfSymtab::getExternalSymbol();
-  return ret;
+  return Ret;
 }
 
-StringRef InstrProfSymtab::getFuncOrVarName(uint64_t MD5Hash) {
+StringRef InstrProfSymtab::getFuncOrVarName(uint64_t MD5Hash) const {
   finalizeSymtab();
   auto Result = llvm::lower_bound(MD5NameMap, MD5Hash,
                                   [](const std::pair<uint64_t, StringRef> &LHS,
@@ -775,7 +781,7 @@ StringRef InstrProfSymtab::getFuncOrVarName(uint64_t MD5Hash) {
   return StringRef();
 }
 
-Function* InstrProfSymtab::getFunction(uint64_t FuncMD5Hash) {
+Function *InstrProfSymtab::getFunction(uint64_t FuncMD5Hash) const {
   finalizeSymtab();
   auto Result = llvm::lower_bound(MD5FuncMap, FuncMD5Hash,
                                   [](const std::pair<uint64_t, Function *> &LHS,
@@ -785,7 +791,7 @@ Function* InstrProfSymtab::getFunction(uint64_t FuncMD5Hash) {
   return nullptr;
 }
 
-GlobalVariable *InstrProfSymtab::getGlobalVariable(uint64_t MD5Hash) {
+GlobalVariable *InstrProfSymtab::getGlobalVariable(uint64_t MD5Hash) const {
   return MD5VTableMap.lookup(MD5Hash);
 }
 
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.h b/llvm/include/llvm/TargetParser/ARMTargetParser.h
index 90eae9e..919598c 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.h
@@ -270,9 +270,9 @@ LLVM_ABI ProfileKind parseArchProfile(StringRef Arch);
 LLVM_ABI unsigned parseArchVersion(StringRef Arch);
 
 LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
-LLVM_ABI StringRef computeDefaultTargetABI(const Triple &TT);
-
-LLVM_ABI ARMABI computeTargetABI(const Triple &TT, StringRef ABIName = "");
+LLVM_ABI LLVM_READONLY StringRef computeDefaultTargetABI(const Triple &TT);
+LLVM_ABI LLVM_READONLY ARMABI computeTargetABI(const Triple &TT,
+                                               StringRef ABIName = "");
 
 /// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting.
 ///
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4f04209..3141060 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1230,10 +1230,11 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
   return TTIImpl->getNumberOfParts(Tp);
 }
 
-InstructionCost
-TargetTransformInfo::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
-                                               const SCEV *Ptr) const {
-  InstructionCost Cost = TTIImpl->getAddressComputationCost(PtrTy, SE, Ptr);
+InstructionCost TargetTransformInfo::getAddressComputationCost(
+    Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+    TTI::TargetCostKind CostKind) const {
+  InstructionCost Cost =
+      TTIImpl->getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index e7b9417..2ef96cc 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -69,18 +69,10 @@ unsigned llvm::ComputeLinearIndex(Type *Ty,
   return CurIndex + 1;
 }
 
-/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
-/// EVTs that represent all the individual underlying
-/// non-aggregate types that comprise it.
-///
-/// If Offsets is non-null, it points to a vector to be filled in
-/// with the in-memory offsets of each of the individual values.
-///
-void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
-                           Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
-                           SmallVectorImpl<EVT> *MemVTs,
-                           SmallVectorImpl<TypeSize> *Offsets,
-                           TypeSize StartingOffset) {
+void llvm::ComputeValueTypes(const DataLayout &DL, Type *Ty,
+                             SmallVectorImpl<Type *> &Types,
+                             SmallVectorImpl<TypeSize> *Offsets,
+                             TypeSize StartingOffset) {
   assert((Ty->isScalableTy() == StartingOffset.isScalable() ||
           StartingOffset.isZero()) &&
          "Offset/TypeSize mismatch!");
@@ -90,15 +82,13 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
     // us to support structs with scalable vectors for operations that don't
     // need offsets.
     const StructLayout *SL = Offsets ? DL.getStructLayout(STy) : nullptr;
-    for (StructType::element_iterator EB = STy->element_begin(),
-                                      EI = EB,
+    for (StructType::element_iterator EB = STy->element_begin(), EI = EB,
                                       EE = STy->element_end();
          EI != EE; ++EI) {
       // Don't compute the element offset if we didn't get a StructLayout above.
       TypeSize EltOffset =
           SL ? SL->getElementOffset(EI - EB) : TypeSize::getZero();
-      ComputeValueVTs(TLI, DL, *EI, ValueVTs, MemVTs, Offsets,
-                      StartingOffset + EltOffset);
+      ComputeValueTypes(DL, *EI, Types, Offsets, StartingOffset + EltOffset);
     }
     return;
   }
@@ -107,21 +97,39 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
     Type *EltTy = ATy->getElementType();
     TypeSize EltSize = DL.getTypeAllocSize(EltTy);
     for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
-      ComputeValueVTs(TLI, DL, EltTy, ValueVTs, MemVTs, Offsets,
-                      StartingOffset + i * EltSize);
+      ComputeValueTypes(DL, EltTy, Types, Offsets,
+                        StartingOffset + i * EltSize);
     return;
   }
   // Interpret void as zero return values.
   if (Ty->isVoidTy())
     return;
-  // Base case: we can get an EVT for this LLVM IR type.
-  ValueVTs.push_back(TLI.getValueType(DL, Ty));
-  if (MemVTs)
-    MemVTs->push_back(TLI.getMemValueType(DL, Ty));
+  Types.push_back(Ty);
   if (Offsets)
     Offsets->push_back(StartingOffset);
 }
 
+/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
+/// EVTs that represent all the individual underlying
+/// non-aggregate types that comprise it.
+///
+/// If Offsets is non-null, it points to a vector to be filled in
+/// with the in-memory offsets of each of the individual values.
+///
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                           Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                           SmallVectorImpl<EVT> *MemVTs,
+                           SmallVectorImpl<TypeSize> *Offsets,
+                           TypeSize StartingOffset) {
+  SmallVector<Type *> Types;
+  ComputeValueTypes(DL, Ty, Types, Offsets, StartingOffset);
+  for (Type *Ty : Types) {
+    ValueVTs.push_back(TLI.getValueType(DL, Ty));
+    if (MemVTs)
+      MemVTs->push_back(TLI.getMemValueType(DL, Ty));
+  }
+}
+
 void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                            Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                            SmallVectorImpl<EVT> *MemVTs,
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index dcfd9aa..7292bc2 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1787,10 +1787,18 @@ ReoptimizeBlock:
       // below were performed for EH "FallThrough" blocks.  Therefore, even if
       // that appears not to be happening anymore, we should assume that it is
       // possible and not remove the "!FallThrough()->isEHPad" condition below.
+      //
+      // Similarly, the analyzeBranch call does not consider callbr, which also
+      // introduces the possibility of infinite rotation, as there may be
+      // multiple successors of PrevBB. Thus we guard against this case by
+      // checking FallThrough->isInlineAsmBrIndirectTarget().
+      // NOTE: Checking whether PrevBB contains a callbr would be more
+      // precise, but much more expensive.
       MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr;
       SmallVector<MachineOperand, 4> PrevCond;
-      if (FallThrough != MF.end() &&
-          !FallThrough->isEHPad() &&
+
+      if (FallThrough != MF.end() && !FallThrough->isEHPad() &&
+          !FallThrough->isInlineAsmBrIndirectTarget() &&
           !TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
           PrevBB.isSuccessor(&*FallThrough)) {
         MBB->moveAfter(&MF.back());
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d9d3569..008c188 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5574,12 +5574,19 @@ LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
 
   unsigned NewElemCount =
       NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
-  LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType());
-
-  // Split the Src and Dst Reg into smaller registers
   SmallVector<Register> SrcVRegs, BitcastVRegs;
-  if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy)
-    return UnableToLegalize;
+  if (NewElemCount == 1) {
+    LLT SrcNarrowTy = SrcTy.getElementType();
+
+    auto Unmerge = MIRBuilder.buildUnmerge(SrcNarrowTy, SrcReg);
+    getUnmergeResults(SrcVRegs, *Unmerge);
+  } else {
+    LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType());
+
+    // Split the Src and Dst Reg into smaller registers
+    if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy)
+      return UnableToLegalize;
+  }
 
   // Build new smaller bitcast instructions
   // Not supporting Leftover types for now but will have to
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index a93a89e..34a9d5d 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -72,6 +73,7 @@ class PHIEliminationImpl {
   LiveIntervals *LIS = nullptr;
   MachineLoopInfo *MLI = nullptr;
   MachineDominatorTree *MDT = nullptr;
+  MachinePostDominatorTree *PDT = nullptr;
 
   /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions
   /// in predecessor basic blocks.
@@ -123,17 +125,22 @@ public:
     auto *MLIWrapper = P->getAnalysisIfAvailable<MachineLoopInfoWrapperPass>();
     auto *MDTWrapper =
         P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+    auto *PDTWrapper =
+        P->getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
     LV = LVWrapper ? &LVWrapper->getLV() : nullptr;
     LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
     MLI = MLIWrapper ? &MLIWrapper->getLI() : nullptr;
     MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
+    PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
   }
 
   PHIEliminationImpl(MachineFunction &MF, MachineFunctionAnalysisManager &AM)
       : LV(AM.getCachedResult<LiveVariablesAnalysis>(MF)),
         LIS(AM.getCachedResult<LiveIntervalsAnalysis>(MF)),
         MLI(AM.getCachedResult<MachineLoopAnalysis>(MF)),
-        MDT(AM.getCachedResult<MachineDominatorTreeAnalysis>(MF)), MFAM(&AM) {}
+        MDT(AM.getCachedResult<MachineDominatorTreeAnalysis>(MF)),
+        PDT(AM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF)),
+        MFAM(&AM) {}
 
   bool run(MachineFunction &MF);
 };
@@ -172,6 +179,7 @@ PHIEliminationPass::run(MachineFunction &MF,
   PA.preserve<LiveVariablesAnalysis>();
   PA.preserve<SlotIndexesAnalysis>();
   PA.preserve<MachineDominatorTreeAnalysis>();
+  PA.preserve<MachinePostDominatorTreeAnalysis>();
   PA.preserve<MachineLoopAnalysis>();
   return PA;
 }
@@ -197,6 +205,7 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<SlotIndexesWrapperPass>();
   AU.addPreserved<LiveIntervalsWrapperPass>();
   AU.addPreserved<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
   AU.addPreserved<MachineLoopInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
@@ -204,15 +213,8 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
 bool PHIEliminationImpl::run(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
 
-  MachineDominatorTree *MDT = nullptr;
-  if (P) {
-    auto *MDTWrapper =
-        P->getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
-    MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
-  } else {
-    MDT = MFAM->getCachedResult<MachineDominatorTreeAnalysis>(MF);
-  }
-  MachineDomTreeUpdater MDTU(MDT, MachineDomTreeUpdater::UpdateStrategy::Lazy);
+  MachineDomTreeUpdater MDTU(MDT, PDT,
+                             MachineDomTreeUpdater::UpdateStrategy::Lazy);
 
   bool Changed = false;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 763b386..1a63518 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -81,12 +81,11 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
 
 /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
 /// implicit physical register output.
-void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
-                                   Register SrcReg, VRBaseMapType &VRBaseMap) {
+void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
+                                   VRBaseMapType &VRBaseMap) {
   Register VRBase;
   if (SrcReg.isVirtual()) {
     // Just use the input register directly!
-    SDValue Op(Node, ResNo);
     if (IsClone)
       VRBaseMap.erase(Op);
     bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
@@ -99,17 +98,15 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
   // the CopyToReg'd destination register instead of creating a new vreg.
   bool MatchReg = true;
   const TargetRegisterClass *UseRC = nullptr;
-  MVT VT = Node->getSimpleValueType(ResNo);
+  MVT VT = Op.getSimpleValueType();
 
   // Stick to the preferred register classes for legal types.
   if (TLI->isTypeLegal(VT))
-    UseRC = TLI->getRegClassFor(VT, Node->isDivergent());
+    UseRC = TLI->getRegClassFor(VT, Op->isDivergent());
 
-  for (SDNode *User : Node->users()) {
+  for (SDNode *User : Op->users()) {
     bool Match = true;
-    if (User->getOpcode() == ISD::CopyToReg &&
-        User->getOperand(2).getNode() == Node &&
-        User->getOperand(2).getResNo() == ResNo) {
+    if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2) == Op) {
       Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
       if (DestReg.isVirtual()) {
         VRBase = DestReg;
@@ -118,10 +115,8 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
         Match = false;
     } else {
       for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
-        SDValue Op = User->getOperand(i);
-        if (Op.getNode() != Node || Op.getResNo() != ResNo)
+        if (User->getOperand(i) != Op)
           continue;
-        MVT VT = Node->getSimpleValueType(Op.getResNo());
         if (VT == MVT::Other || VT == MVT::Glue)
           continue;
         Match = false;
@@ -170,11 +165,11 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
   } else {
     // Create the reg, emit the copy.
     VRBase = MRI->createVirtualRegister(DstRC);
-    BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
-            VRBase).addReg(SrcReg);
+    BuildMI(*MBB, InsertPos, Op.getDebugLoc(), TII->get(TargetOpcode::COPY),
+            VRBase)
+        .addReg(SrcReg);
   }
 
-  SDValue Op(Node, ResNo);
   if (IsClone)
     VRBaseMap.erase(Op);
   bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
@@ -1170,7 +1165,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
         continue;
       // This implicitly defined physreg has a use.
       UsedRegs.push_back(Reg);
-      EmitCopyFromReg(Node, i, IsClone, Reg, VRBaseMap);
+      EmitCopyFromReg(SDValue(Node, i), IsClone, Reg, VRBaseMap);
     }
   }
 
@@ -1283,7 +1278,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
   }
   case ISD::CopyFromReg: {
     Register SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
-    EmitCopyFromReg(Node, 0, IsClone, SrcReg, VRBaseMap);
+    EmitCopyFromReg(SDValue(Node, 0), IsClone, SrcReg, VRBaseMap);
     break;
   }
   case ISD::EH_LABEL:
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
index 16d754c..b465de8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -48,8 +48,8 @@ private:
 
   /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
   /// implicit physical register output.
-  void EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
-                       Register SrcReg, VRBaseMapType &VRBaseMap);
+  void EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
+                       VRBaseMapType &VRBaseMap);
 
   void CreateVirtualRegisters(SDNode *Node,
                               MachineInstrBuilder &MIB,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 2cad36e..83bb1df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -197,7 +197,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC) {
   SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   TargetLowering::MakeLibCallOptions CallOptions;
   EVT OpVT = N->getOperand(0 + Offset).getValueType();
-  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
                                                     CallOptions, SDLoc(N),
                                                     Chain);
@@ -218,7 +218,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC) {
   TargetLowering::MakeLibCallOptions CallOptions;
   EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(),
                    N->getOperand(1 + Offset).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0));
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops,
                                                     CallOptions, SDLoc(N),
                                                     Chain);
@@ -558,7 +558,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
   EVT OpsVT[3] = { N->getOperand(0 + Offset).getValueType(),
                    N->getOperand(1 + Offset).getValueType(),
                    N->getOperand(2 + Offset).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0));
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG,
                                                     GetFPLibCall(N->getValueType(0),
                                                                  RTLIB::FMA_F32,
@@ -642,7 +642,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
   TargetLowering::MakeLibCallOptions CallOptions;
   EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType();
-  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
                                                     CallOptions, SDLoc(N),
                                                     Chain);
@@ -658,7 +658,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) {
   SDValue Op = N->getOperand(0);
   TargetLowering::MakeLibCallOptions CallOptions;
   EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0));
   SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op,
                                   CallOptions, SDLoc(N)).first;
   if (N->getValueType(0) == MVT::f32)
@@ -694,7 +694,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
   TargetLowering::MakeLibCallOptions CallOptions;
   EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType();
-  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
                                                     CallOptions, SDLoc(N),
                                                     Chain);
@@ -742,7 +742,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ExpOp(SDNode *N) {
   TargetLowering::MakeLibCallOptions CallOptions;
   EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(),
                    N->getOperand(1 + Offset).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0));
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops,
                                                     CallOptions, SDLoc(N),
                                                     Chain);
@@ -779,7 +779,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFREXP(SDNode *N) {
 
   // TODO: setTypeListBeforeSoften can't properly express multiple return types,
   // but we only really need to handle the 0th one for softening anyway.
-  CallOptions.setTypeListBeforeSoften({OpsVT}, VT0, true)
+  CallOptions.setTypeListBeforeSoften({OpsVT}, VT0)
       .setOpsTypeOverrides(CallOpsTypeOverrides);
 
   auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT0, Ops, CallOptions, DL,
@@ -828,7 +828,7 @@ bool DAGTypeLegalizer::SoftenFloatRes_UnaryWithTwoFPResults(
   TargetLowering::MakeLibCallOptions CallOptions;
   // TODO: setTypeListBeforeSoften can't properly express multiple return types,
   // but since both returns have the same type it should be okay.
-  CallOptions.setTypeListBeforeSoften({OpsVT}, VT, true)
+  CallOptions.setTypeListBeforeSoften({OpsVT}, VT)
       .setOpsTypeOverrides(CallOpsTypeOverrides);
 
   auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, DL,
@@ -1100,7 +1100,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
                            NVT, N->getOperand(IsStrict ? 1 : 0));
   TargetLowering::MakeLibCallOptions CallOptions;
   CallOptions.setIsSigned(Signed);
-  CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+  CallOptions.setTypeListBeforeSoften(SVT, RVT);
   std::pair<SDValue, SDValue> Tmp =
       TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
                       Op, CallOptions, dl, Chain);
@@ -1222,7 +1222,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
   SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   Op = GetSoftenedFloat(Op);
   TargetLowering::MakeLibCallOptions CallOptions;
-  CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+  CallOptions.setTypeListBeforeSoften(SVT, RVT);
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RVT, Op,
                                                     CallOptions, SDLoc(N),
                                                     Chain);
@@ -1298,7 +1298,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
   Op = GetSoftenedFloat(Op);
   SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   TargetLowering::MakeLibCallOptions CallOptions;
-  CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+  CallOptions.setTypeListBeforeSoften(SVT, RVT);
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
                                                     CallOptions, dl, Chain);
 
@@ -1453,7 +1453,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_Unary(SDNode *N, RTLIB::Libcall LC) {
   SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   TargetLowering::MakeLibCallOptions CallOptions;
   EVT OpVT = N->getOperand(0 + Offset).getValueType();
-  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
   std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
                                                     CallOptions, SDLoc(N),
                                                     Chain);
@@ -1551,6 +1551,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::VAARG:              ExpandRes_VAARG(N, Lo, Hi); break;
 
   case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+  case ISD::AssertNoFPClass: ExpandFloatRes_AssertNoFPClass(N, Lo, Hi); break;
   case ISD::FABS:       ExpandFloatRes_FABS(N, Lo, Hi); break;
   case ISD::STRICT_FMINNUM:
   case ISD::FMINNUM:    ExpandFloatRes_FMINNUM(N, Lo, Hi); break;
@@ -1966,6 +1967,13 @@ void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
 }
 
+void DAGTypeLegalizer::ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo,
+                                                      SDValue &Hi) {
+  // TODO: Handle ppcf128 by preserving AssertNoFPClass for one of the halves.
+  // Until then the assertion is dropped: Lo/Hi are the operand's halves.
+  GetExpandedFloat(N->getOperand(0), Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
@@ -3559,7 +3567,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) {
     SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
     Op = GetSoftenedFloat(Op);
     TargetLowering::MakeLibCallOptions CallOptions;
-    CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+    CallOptions.setTypeListBeforeSoften(SVT, RVT);
     std::pair<SDValue, SDValue> Tmp =
         TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N), Chain);
     if (IsStrict)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 63544e6..33fa301 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -681,6 +681,7 @@ private:
       SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo = {});
 
   // clang-format off
+  void ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FABS      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FACOS     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FASIN     (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 68ea72c..4b7fc45 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5460,6 +5460,83 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
     }
     return true;
 
+  case ISD::EXTRACT_SUBVECTOR: {
+    SDValue Src = Op.getOperand(0);
+    if (Src.getValueType().isScalableVector())
+      break;
+    uint64_t Idx = Op.getConstantOperandVal(1);
+    unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+    APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
+    return isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts, PoisonOnly,
+                                            Depth + 1);
+  }
+
+  case ISD::INSERT_SUBVECTOR: {
+    if (Op.getValueType().isScalableVector())
+      break;
+    SDValue Src = Op.getOperand(0);
+    SDValue Sub = Op.getOperand(1);
+    uint64_t Idx = Op.getConstantOperandVal(2);
+    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+    APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+    APInt DemandedSrcElts = DemandedElts;
+    DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);
+
+    if (!!DemandedSubElts && !isGuaranteedNotToBeUndefOrPoison(
+                                 Sub, DemandedSubElts, PoisonOnly, Depth + 1))
+      return false;
+    if (!!DemandedSrcElts && !isGuaranteedNotToBeUndefOrPoison(
+                                 Src, DemandedSrcElts, PoisonOnly, Depth + 1))
+      return false;
+    return true;
+  }
+
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Src = Op.getOperand(0);
+    auto *IndexC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.isFixedLengthVector() && IndexC &&
+        IndexC->getAPIntValue().ult(SrcVT.getVectorNumElements())) {
+      APInt DemandedSrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
+                                                  IndexC->getZExtValue());
+      return isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrcElts, PoisonOnly,
+                                              Depth + 1);
+    }
+    break;
+  }
+
+  case ISD::INSERT_VECTOR_ELT: {
+    SDValue InVec = Op.getOperand(0);
+    SDValue InVal = Op.getOperand(1);
+    SDValue EltNo = Op.getOperand(2);
+    EVT VT = InVec.getValueType();
+    auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+    if (IndexC && VT.isFixedLengthVector() &&
+        IndexC->getAPIntValue().ult(VT.getVectorNumElements())) {
+      if (DemandedElts[IndexC->getZExtValue()] &&
+          !isGuaranteedNotToBeUndefOrPoison(InVal, PoisonOnly, Depth + 1))
+        return false;
+      APInt InVecDemandedElts = DemandedElts;
+      InVecDemandedElts.clearBit(IndexC->getZExtValue());
+      if (!!InVecDemandedElts &&
+          !isGuaranteedNotToBeUndefOrPoison(InVec, InVecDemandedElts,
+                                            PoisonOnly, Depth + 1))
+        return false;
+      return true;
+    }
+    break;
+  }
+
+  case ISD::SCALAR_TO_VECTOR:
+    // Check upper (known undef) elements.
+    if (DemandedElts.ugt(1) && !PoisonOnly)
+      return false;
+    // Check element zero.
+    if (DemandedElts[0] && !isGuaranteedNotToBeUndefOrPoison(
+                               Op.getOperand(0), PoisonOnly, Depth + 1))
+      return false;
+    return true;
+
   case ISD::SPLAT_VECTOR:
     return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly,
                                             Depth + 1);
@@ -5482,6 +5559,52 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
     return true;
   }
 
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
+    // Shift amount operand is checked by canCreateUndefOrPoison. So it is
+    // enough to check operand 0 if Op can't create undef/poison.
+    return !canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly,
+                                   /*ConsiderFlags*/ true, Depth) &&
+           isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedElts,
+                                            PoisonOnly, Depth + 1);
+
+  case ISD::BSWAP:
+  case ISD::CTPOP:
+  case ISD::BITREVERSE:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:
+  case ISD::SSHLSAT:
+  case ISD::USHLSAT:
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::TRUNCATE:
+  case ISD::VSELECT: {
+    // If Op can't create undef/poison and none of its operands are undef/poison
+    // then Op is never undef/poison. A difference from the more common check
+    // below, outside the switch, is that we handle elementwise operations for
+    // which the DemandedElts mask is valid for all operands here.
+    return !canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly,
+                                   /*ConsiderFlags*/ true, Depth) &&
+           all_of(Op->ops(), [&](SDValue V) {
+             return isGuaranteedNotToBeUndefOrPoison(V, DemandedElts,
+                                                     PoisonOnly, Depth + 1);
+           });
+  }
+
     // TODO: Search for noundef attributes from library functions.
 
     // TODO: Pointers dereferenced by ISD::LOAD/STORE ops are noundef.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index a71b440..366a230 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2211,9 +2211,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
     Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
                         MVT::Other, Chains);
   } else if (I.getNumOperands() != 0) {
-    SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs);
-    unsigned NumValues = ValueVTs.size();
+    SmallVector<Type *, 4> Types;
+    ComputeValueTypes(DL, I.getOperand(0)->getType(), Types);
+    unsigned NumValues = Types.size();
     if (NumValues) {
       SDValue RetOp = getValue(I.getOperand(0));
 
@@ -2233,7 +2233,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
       bool RetInReg = F->getAttributes().hasRetAttr(Attribute::InReg);
 
       for (unsigned j = 0; j != NumValues; ++j) {
-        EVT VT = ValueVTs[j];
+        EVT VT = TLI.getValueType(DL, Types[j]);
 
         if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
           VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);
@@ -2275,7 +2275,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
         for (unsigned i = 0; i < NumParts; ++i) {
           Outs.push_back(ISD::OutputArg(Flags,
                                         Parts[i].getValueType().getSimpleVT(),
-                                        VT, I.getOperand(0)->getType(), 0, 0));
+                                        VT, Types[j], 0, 0));
           OutVals.push_back(Parts[i]);
         }
       }
@@ -10983,15 +10983,21 @@ std::pair<SDValue, SDValue>
 TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
   // Handle the incoming return values from the call.
   CLI.Ins.clear();
-  SmallVector<EVT, 4> RetTys;
+  SmallVector<Type *, 4> RetOrigTys;
   SmallVector<TypeSize, 4> Offsets;
   auto &DL = CLI.DAG.getDataLayout();
-  ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets);
+  ComputeValueTypes(DL, CLI.RetTy, RetOrigTys, &Offsets);
+
+  SmallVector<EVT, 4> RetTys;
+  for (Type *Ty : RetOrigTys)
+    RetTys.push_back(getValueType(DL, Ty));
 
   if (CLI.IsPostTypeLegalization) {
     // If we are lowering a libcall after legalization, split the return type.
+    SmallVector<Type *, 4> OldRetOrigTys;
     SmallVector<EVT, 4> OldRetTys;
     SmallVector<TypeSize, 4> OldOffsets;
+    RetOrigTys.swap(OldRetOrigTys);
     RetTys.swap(OldRetTys);
     Offsets.swap(OldOffsets);
 
@@ -11001,6 +11007,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT);
       unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT);
       unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8;
+      RetOrigTys.append(NumRegs, OldRetOrigTys[i]);
       RetTys.append(NumRegs, RegisterVT);
       for (unsigned j = 0; j != NumRegs; ++j)
         Offsets.push_back(TypeSize::getFixed(Offset + j * RegisterVTByteSZ));
@@ -11069,7 +11076,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
                                                        CLI.CallConv, VT);
       for (unsigned i = 0; i != NumRegs; ++i) {
-        ISD::InputArg Ret(Flags, RegisterVT, VT, CLI.RetTy,
+        ISD::InputArg Ret(Flags, RegisterVT, VT, RetOrigTys[I],
                           CLI.IsReturnValueUsed, ISD::InputArg::NoArgIndex, 0);
         if (CLI.RetTy->isPointerTy()) {
           Ret.Flags.setPointer();
@@ -11106,18 +11113,18 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
   CLI.Outs.clear();
   CLI.OutVals.clear();
   for (unsigned i = 0, e = Args.size(); i != e; ++i) {
-    SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
+    SmallVector<Type *, 4> ArgTys;
+    ComputeValueTypes(DL, Args[i].Ty, ArgTys);
     // FIXME: Split arguments if CLI.IsPostTypeLegalization
     Type *FinalType = Args[i].Ty;
     if (Args[i].IsByVal)
       FinalType = Args[i].IndirectType;
     bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
         FinalType, CLI.CallConv, CLI.IsVarArg, DL);
-    for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues;
+    for (unsigned Value = 0, NumValues = ArgTys.size(); Value != NumValues;
          ++Value) {
-      EVT VT = ValueVTs[Value];
-      Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
+      Type *ArgTy = ArgTys[Value];
+      EVT VT = getValueType(DL, ArgTy);
       SDValue Op = SDValue(Args[i].Node.getNode(),
                            Args[i].Node.getResNo() + Value);
       ISD::ArgFlagsTy Flags;
@@ -11130,10 +11137,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
 
       if (i >= CLI.NumFixedArgs)
         Flags.setVarArg();
-      if (Args[i].Ty->isPointerTy()) {
+      if (ArgTy->isPointerTy()) {
         Flags.setPointer();
-        Flags.setPointerAddrSpace(
-            cast<PointerType>(Args[i].Ty)->getAddressSpace());
+        Flags.setPointerAddrSpace(cast<PointerType>(ArgTy)->getAddressSpace());
       }
       if (Args[i].IsZExt)
         Flags.setZExt();
@@ -11252,7 +11258,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
         // For scalable vectors the scalable part is currently handled
         // by individual targets, so we just use the known minimum size here.
         ISD::OutputArg MyFlags(
-            Flags, Parts[j].getValueType().getSimpleVT(), VT, Args[i].Ty, i,
+            Flags, Parts[j].getValueType().getSimpleVT(), VT, ArgTy, i,
             j * Parts[j].getValueType().getStoreSize().getKnownMinValue());
         if (NumParts > 1 && j == 0)
           MyFlags.Flags.setSplit();
@@ -11645,8 +11651,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
   // Set up the incoming argument description vector.
   for (const Argument &Arg : F.args()) {
     unsigned ArgNo = Arg.getArgNo();
-    SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
+    SmallVector<Type *, 4> Types;
+    ComputeValueTypes(DAG.getDataLayout(), Arg.getType(), Types);
     bool isArgValueUsed = !Arg.use_empty();
     unsigned PartBase = 0;
     Type *FinalType = Arg.getType();
@@ -11654,17 +11660,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       FinalType = Arg.getParamByValType();
     bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
         FinalType, F.getCallingConv(), F.isVarArg(), DL);
-    for (unsigned Value = 0, NumValues = ValueVTs.size();
-         Value != NumValues; ++Value) {
-      EVT VT = ValueVTs[Value];
-      Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+    for (unsigned Value = 0, NumValues = Types.size(); Value != NumValues;
+         ++Value) {
+      Type *ArgTy = Types[Value];
+      EVT VT = TLI->getValueType(DL, ArgTy);
       ISD::ArgFlagsTy Flags;
 
-
-      if (Arg.getType()->isPointerTy()) {
+      if (ArgTy->isPointerTy()) {
         Flags.setPointer();
-        Flags.setPointerAddrSpace(
-            cast<PointerType>(Arg.getType())->getAddressSpace());
+        Flags.setPointerAddrSpace(cast<PointerType>(ArgTy)->getAddressSpace());
       }
       if (Arg.hasAttribute(Attribute::ZExt))
         Flags.setZExt();
@@ -11768,7 +11772,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
         // are responsible for handling scalable vector arguments and
         // return values.
         ISD::InputArg MyFlags(
-            Flags, RegisterVT, VT, Arg.getType(), isArgValueUsed, ArgNo,
+            Flags, RegisterVT, VT, ArgTy, isArgValueUsed, ArgNo,
             PartBase + i * RegisterVT.getStoreSize().getKnownMinValue());
         if (NumRegs > 1 && i == 0)
           MyFlags.Flags.setSplit();
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8fbabfa..911bbab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -420,7 +420,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
   TargetLowering::MakeLibCallOptions CallOptions;
   EVT OpsVT[2] = { OldLHS.getValueType(),
                    OldRHS.getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, RetVT, true);
+  CallOptions.setTypeListBeforeSoften(OpsVT, RetVT);
   auto Call = makeLibCall(DAG, LC1, RetVT, Ops, CallOptions, dl, Chain);
   NewLHS = Call.first;
   NewRHS = DAG.getConstant(0, dl, RetVT);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 61ff2df..350948a 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1738,13 +1738,13 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
                          AttributeList attr,
                          SmallVectorImpl<ISD::OutputArg> &Outs,
                          const TargetLowering &TLI, const DataLayout &DL) {
-  SmallVector<EVT, 4> ValueVTs;
-  ComputeValueVTs(TLI, DL, ReturnType, ValueVTs);
-  unsigned NumValues = ValueVTs.size();
+  SmallVector<Type *, 4> Types;
+  ComputeValueTypes(DL, ReturnType, Types);
+  unsigned NumValues = Types.size();
   if (NumValues == 0) return;
 
-  for (unsigned j = 0, f = NumValues; j != f; ++j) {
-    EVT VT = ValueVTs[j];
+  for (Type *Ty : Types) {
+    EVT VT = TLI.getValueType(DL, Ty);
     ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
 
     if (attr.hasRetAttr(Attribute::SExt))
@@ -1772,7 +1772,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
       Flags.setZExt();
 
     for (unsigned i = 0; i < NumParts; ++i)
-      Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, ReturnType, 0, 0));
+      Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, Ty, 0, 0));
   }
 }
 
diff --git a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp
index 19c000e..d460cf6 100644
--- a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp
@@ -14,40 +14,39 @@
 namespace llvm {
 namespace orc {
 
-ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM,
-                                ThreadSafeContext TSCtx,
-                                GVPredicate ShouldCloneDef,
-                                GVModifier UpdateClonedDefSource) {
-  assert(TSM && "Can not clone null module");
-
-  if (!ShouldCloneDef)
-    ShouldCloneDef = [](const GlobalValue &) { return true; };
-
-  // First copy the source module into a buffer.
+static std::pair<std::string, SmallVector<char, 1>>
+serializeModule(const Module &M, GVPredicate ShouldCloneDef,
+                GVModifier UpdateClonedDefSource) {
   std::string ModuleName;
   SmallVector<char, 1> ClonedModuleBuffer;
-  TSM.withModuleDo([&](Module &M) {
-    ModuleName = M.getModuleIdentifier();
-    std::set<GlobalValue *> ClonedDefsInSrc;
-    ValueToValueMapTy VMap;
-    auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) {
-      if (ShouldCloneDef(*GV)) {
-        ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV));
-        return true;
-      }
-      return false;
-    });
-
-    if (UpdateClonedDefSource)
-      for (auto *GV : ClonedDefsInSrc)
-        UpdateClonedDefSource(*GV);
-
-    BitcodeWriter BCWriter(ClonedModuleBuffer);
-    BCWriter.writeModule(*Tmp);
-    BCWriter.writeSymtab();
-    BCWriter.writeStrtab();
+
+  ModuleName = M.getModuleIdentifier();
+  std::set<GlobalValue *> ClonedDefsInSrc;
+  ValueToValueMapTy VMap;
+  auto Tmp = CloneModule(M, VMap, [&](const GlobalValue *GV) {
+    if (ShouldCloneDef(*GV)) {
+      ClonedDefsInSrc.insert(const_cast<GlobalValue *>(GV));
+      return true;
+    }
+    return false;
   });
 
+  if (UpdateClonedDefSource)
+    for (auto *GV : ClonedDefsInSrc)
+      UpdateClonedDefSource(*GV);
+
+  BitcodeWriter BCWriter(ClonedModuleBuffer);
+  BCWriter.writeModule(*Tmp);
+  BCWriter.writeSymtab();
+  BCWriter.writeStrtab();
+
+  return {std::move(ModuleName), std::move(ClonedModuleBuffer)};
+}
+
+ThreadSafeModule
+deserializeModule(std::string ModuleName,
+                  const SmallVector<char, 1> &ClonedModuleBuffer,
+                  ThreadSafeContext TSCtx) {
   MemoryBufferRef ClonedModuleBufferRef(
       StringRef(ClonedModuleBuffer.data(), ClonedModuleBuffer.size()),
       "cloned module buffer");
@@ -63,6 +62,40 @@ ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM,
   return ThreadSafeModule(std::move(M), std::move(TSCtx));
 }
 
+ThreadSafeModule
+cloneExternalModuleToContext(const Module &M, ThreadSafeContext TSCtx,
+                             GVPredicate ShouldCloneDef,
+                             GVModifier UpdateClonedDefSource) {
+  // A null predicate means every definition is cloned.
+  if (!ShouldCloneDef)
+    ShouldCloneDef = [](const GlobalValue &) { return true; };
+
+  // Serialize M to a buffer, then rebuild the copy for TSCtx.
+  auto Serialized = serializeModule(M, std::move(ShouldCloneDef),
+                                    std::move(UpdateClonedDefSource));
+  return deserializeModule(std::move(Serialized.first), Serialized.second,
+                           std::move(TSCtx));
+}
+
+ThreadSafeModule cloneToContext(const ThreadSafeModule &TSM,
+                                ThreadSafeContext TSCtx,
+                                GVPredicate ShouldCloneDef,
+                                GVModifier UpdateClonedDefSource) {
+  assert(TSM && "Can not clone null module");
+
+  // A null predicate means every definition is cloned.
+  if (!ShouldCloneDef)
+    ShouldCloneDef = [](const GlobalValue &) { return true; };
+
+  // Serialize inside withModuleDo, then rebuild the copy for TSCtx.
+  auto Serialized = TSM.withModuleDo([&](Module &SrcM) {
+    return serializeModule(SrcM, std::move(ShouldCloneDef),
+                           std::move(UpdateClonedDefSource));
+  });
+  return deserializeModule(std::move(Serialized.first), Serialized.second,
+                           std::move(TSCtx));
+}
+
 ThreadSafeModule cloneToNewContext(const ThreadSafeModule &TSM,
                                    GVPredicate ShouldCloneDef,
                                    GVModifier UpdateClonedDefSource) {
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index f7ef4aa..8b5965b 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -2186,6 +2186,11 @@ void LLVMGlobalSetMetadata(LLVMValueRef Global, unsigned Kind,
   unwrap<GlobalObject>(Global)->setMetadata(Kind, unwrap<MDNode>(MD));
 }
 
+void LLVMGlobalAddMetadata(LLVMValueRef Global, unsigned Kind,
+                           LLVMMetadataRef MD) {
+  unwrap<GlobalObject>(Global)->addMetadata(Kind, *unwrap<MDNode>(MD));
+}
+
 void LLVMGlobalEraseMetadata(LLVMValueRef Global, unsigned Kind) {
   unwrap<GlobalObject>(Global)->eraseMetadata(Kind);
 }
@@ -2194,6 +2199,11 @@ void LLVMGlobalClearMetadata(LLVMValueRef Global) {
   unwrap<GlobalObject>(Global)->clearMetadata();
 }
 
+void LLVMGlobalAddDebugInfo(LLVMValueRef Global, LLVMMetadataRef GVE) {
+  unwrap<GlobalVariable>(Global)->addDebugInfo(
+      unwrap<DIGlobalVariableExpression>(GVE));
+}
+
 /*--.. Operations on global variables ......................................--*/
 
 LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 65d4840..e9425e1 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -1915,23 +1915,16 @@ void at::RAUW(DIAssignID *Old, DIAssignID *New) {
 }
 
 void at::deleteAll(Function *F) {
-  SmallVector<DbgAssignIntrinsic *, 12> ToDelete;
-  SmallVector<DbgVariableRecord *, 12> DPToDelete;
   for (BasicBlock &BB : *F) {
     for (Instruction &I : BB) {
-      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      for (DbgVariableRecord &DVR :
+           make_early_inc_range(filterDbgVars(I.getDbgRecordRange())))
         if (DVR.isDbgAssign())
-          DPToDelete.push_back(&DVR);
-      if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
-        ToDelete.push_back(DAI);
-      else
-        I.setMetadata(LLVMContext::MD_DIAssignID, nullptr);
+          DVR.eraseFromParent();
+
+      I.setMetadata(LLVMContext::MD_DIAssignID, nullptr);
     }
   }
-  for (auto *DAI : ToDelete)
-    DAI->eraseFromParent();
-  for (auto *DVR : DPToDelete)
-    DVR->eraseFromParent();
 }
 
 /// FIXME: Remove this wrapper function and call
@@ -2008,8 +2001,6 @@ void at::remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map,
   }
   if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID))
     I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID));
-  else if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
-    DAI->setAssignId(GetNewID(DAI->getAssignID()));
 }
 
 /// Collect constant properies (base, size, offset) of \p StoreDest.
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index a8e6c79..ac845c4 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -9,6 +9,7 @@
 #include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/ADT/StringTable.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/TargetParser/ARMTargetParser.h"
 
 #define DEBUG_TYPE "runtime-libcalls-info"
 
@@ -21,61 +22,33 @@ using namespace RTLIB;
 #undef GET_INIT_RUNTIME_LIBCALL_NAMES
 #undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS
 
-static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT,
-                               FloatABI::ABIType FloatABIType,
-                               EABI EABIVersion) {
-  static const RTLIB::LibcallImpl AAPCS_Libcalls[] = {
-      RTLIB::__aeabi_dadd,        RTLIB::__aeabi_ddiv,
-      RTLIB::__aeabi_dmul,        RTLIB::__aeabi_dsub,
-      RTLIB::__aeabi_dcmpeq__oeq, RTLIB::__aeabi_dcmpeq__une,
-      RTLIB::__aeabi_dcmplt,      RTLIB::__aeabi_dcmple,
-      RTLIB::__aeabi_dcmpge,      RTLIB::__aeabi_dcmpgt,
-      RTLIB::__aeabi_dcmpun,      RTLIB::__aeabi_fadd,
-      RTLIB::__aeabi_fdiv,        RTLIB::__aeabi_fmul,
-      RTLIB::__aeabi_fsub,        RTLIB::__aeabi_fcmpeq__oeq,
-      RTLIB::__aeabi_fcmpeq__une, RTLIB::__aeabi_fcmplt,
-      RTLIB::__aeabi_fcmple,      RTLIB::__aeabi_fcmpge,
-      RTLIB::__aeabi_fcmpgt,      RTLIB::__aeabi_fcmpun,
-      RTLIB::__aeabi_d2iz,        RTLIB::__aeabi_d2uiz,
-      RTLIB::__aeabi_d2lz,        RTLIB::__aeabi_d2ulz,
-      RTLIB::__aeabi_f2iz,        RTLIB::__aeabi_f2uiz,
-      RTLIB::__aeabi_f2lz,        RTLIB::__aeabi_f2ulz,
-      RTLIB::__aeabi_d2f,         RTLIB::__aeabi_d2h,
-      RTLIB::__aeabi_f2d,         RTLIB::__aeabi_i2d,
-      RTLIB::__aeabi_ui2d,        RTLIB::__aeabi_l2d,
-      RTLIB::__aeabi_ul2d,        RTLIB::__aeabi_i2f,
-      RTLIB::__aeabi_ui2f,        RTLIB::__aeabi_l2f,
-      RTLIB::__aeabi_ul2f,        RTLIB::__aeabi_lmul,
-      RTLIB::__aeabi_llsl,        RTLIB::__aeabi_llsr,
-      RTLIB::__aeabi_lasr,        RTLIB::__aeabi_idiv,
-      RTLIB::__aeabi_idivmod,     RTLIB::__aeabi_uidivmod,
-      RTLIB::__aeabi_ldivmod,     RTLIB::__aeabi_uidiv,
-      RTLIB::__aeabi_uldivmod,    RTLIB::__aeabi_f2h,
-      RTLIB::__aeabi_d2h,         RTLIB::__aeabi_h2f,
-      RTLIB::__aeabi_memcpy,      RTLIB::__aeabi_memmove,
-      RTLIB::__aeabi_memset,      RTLIB::__aeabi_memcpy4,
-      RTLIB::__aeabi_memcpy8,     RTLIB::__aeabi_memmove4,
-      RTLIB::__aeabi_memmove8,    RTLIB::__aeabi_memset4,
-      RTLIB::__aeabi_memset8,     RTLIB::__aeabi_memclr,
-      RTLIB::__aeabi_memclr4,     RTLIB::__aeabi_memclr8};
-
-  for (RTLIB::LibcallImpl Impl : AAPCS_Libcalls)
-    Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS);
-}
-
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
                                        ExceptionHandling ExceptionModel,
                                        FloatABI::ABIType FloatABI,
                                        EABI EABIVersion, StringRef ABIName) {
-  setTargetRuntimeLibcallSets(TT, FloatABI);
+  setTargetRuntimeLibcallSets(TT, FloatABI, EABIVersion, ABIName);
 
   if (ExceptionModel == ExceptionHandling::SjLj)
     setLibcallImpl(RTLIB::UNWIND_RESUME, RTLIB::_Unwind_SjLj_Resume);
 
   if (TT.isARM() || TT.isThumb()) {
-    setARMLibcallNames(*this, TT, FloatABI, EABIVersion);
+    // The half <-> float conversion functions are always soft-float on
+    // non-watchos platforms, but are needed for some targets which use a
+    // hard-float calling convention by default.
+    if (!TT.isWatchABI()) {
+      if (isAAPCS_ABI(TT, ABIName)) {
+        setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS);
+        setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS);
+        setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS);
+      } else {
+        setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS);
+        setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS);
+        setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS);
+      }
+    }
+
     return;
   }
 
@@ -130,6 +103,11 @@ RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) {
   return make_range(EntriesBegin, EntriesEnd);
 }
 
+bool RuntimeLibcallsInfo::isAAPCS_ABI(const Triple &TT, StringRef ABIName) {
+  const ARM::ARMABI TargetABI = ARM::computeTargetABI(TT, ABIName);
+  return TargetABI == ARM::ARM_ABI_AAPCS || TargetABI == ARM::ARM_ABI_AAPCS16;
+}
+
 bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) {
   switch (TT.getOS()) {
   case Triple::MacOSX:
diff --git a/llvm/lib/MCA/Instruction.cpp b/llvm/lib/MCA/Instruction.cpp
index d4adfce..7966708 100644
--- a/llvm/lib/MCA/Instruction.cpp
+++ b/llvm/lib/MCA/Instruction.cpp
@@ -128,6 +128,13 @@ void WriteState::dump() const {
 }
 #endif
 
+#ifndef NDEBUG
+void ReadState::dump() const {
+  dbgs() << "{ OpIdx=" << RD->OpIndex << ", RegID " << getRegisterID()
+         << ", Cycles Left=" << CyclesLeft << " }";
+}
+#endif
+
 const CriticalDependency &Instruction::computeCriticalRegDep() {
   if (CriticalRegDep.Cycles)
     return CriticalRegDep;
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index cdf4412..fc2577e 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -519,7 +519,7 @@ Error InstrProfSymtab::create(SectionRef &Section) {
   return Error::success();
 }
 
-StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) {
+StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) const {
   if (Pointer < Address)
     return StringRef();
   auto Offset = Pointer - Address;
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 5425729..7885e12 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -684,13 +684,13 @@ Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName,
   return Error::success();
 }
 
-uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) {
+uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) const {
   // Given a runtime address, look up the hash value in the interval map, and
   // fallback to value 0 if a hash value is not found.
   return VTableAddrMap.lookup(Address, 0);
 }
 
-uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) {
+uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) const {
   finalizeSymtab();
   auto It = partition_point(AddrToMD5Map, [=](std::pair<uint64_t, uint64_t> A) {
     return A.first < Address;
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index 954af7f..1547f48 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -3136,6 +3136,22 @@ APInt APIntOps::mulhu(const APInt &C1, const APInt &C2) {
   return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
 }
 
+APInt APIntOps::mulsExtended(const APInt &C1, const APInt &C2) {
+  assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
+  unsigned FullWidth = C1.getBitWidth() * 2;
+  APInt C1Ext = C1.sext(FullWidth);
+  APInt C2Ext = C2.sext(FullWidth);
+  return C1Ext * C2Ext;
+}
+
+APInt APIntOps::muluExtended(const APInt &C1, const APInt &C2) {
+  assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
+  unsigned FullWidth = C1.getBitWidth() * 2;
+  APInt C1Ext = C1.zext(FullWidth);
+  APInt C2Ext = C2.zext(FullWidth);
+  return C1Ext * C2Ext;
+}
+
 APInt APIntOps::pow(const APInt &X, int64_t N) {
   assert(N >= 0 && "negative exponents not supported.");
   APInt Acc = APInt(X.getBitWidth(), 1);
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index 601f11f..1c4645a 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -501,8 +501,14 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
     std::unique_ptr<MB> Result(
         new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>(
             RequiresNullTerminator, FD, MapSize, Offset, EC));
-    if (!EC)
-      return std::move(Result);
+    if (!EC) {
+      // On at least Linux, and possibly on other systems, mmap may return pages
+      // from the page cache that are not properly filled with trailing zeroes,
+      // if some prior user of the page wrote non-zero bytes. Detect this and
+      // don't use mmap in that case.
+      if (!RequiresNullTerminator || *Result->getBufferEnd() == '\0')
+        return std::move(Result);
+    }
   }
 
 #ifdef __MVS__
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ba02c82..885f2a9 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1487,11 +1487,8 @@ bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
 
     if (Opc == AArch64::BL) {
       auto Op1 = MBBI->getOperand(0);
-      auto &TLI =
-          *MBBI->getMF()->getSubtarget<AArch64Subtarget>().getTargetLowering();
-      char const *GetCurrentVG =
-          TLI.getLibcallName(RTLIB::SMEABI_GET_CURRENT_VG);
-      return Op1.isSymbol() && StringRef(Op1.getSymbolName()) == GetCurrentVG;
+      return Op1.isSymbol() &&
+             (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
     }
   }
 
@@ -3471,7 +3468,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
-  auto &TLI = *MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   bool NeedsWinCFI = needsWinCFI(MF);
@@ -3585,11 +3581,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
               .addReg(AArch64::X0, RegState::Implicit)
               .setMIFlag(MachineInstr::FrameSetup);
 
-        RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG;
-        const uint32_t *RegMask =
-            TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(LC));
+        const uint32_t *RegMask = TRI->getCallPreservedMask(
+            MF,
+            CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
         BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
-            .addExternalSymbol(TLI.getLibcallName(LC))
+            .addExternalSymbol("__arm_get_current_vg")
             .addRegMask(RegMask)
             .addReg(AArch64::X0, RegState::ImplicitDefine)
             .setMIFlag(MachineInstr::FrameSetup);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 224bbe7..2072e48 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3083,12 +3083,13 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   if (FuncInfo->isSMESaveBufferUsed()) {
-    RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
     const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
-        .addExternalSymbol(getLibcallName(LC))
+        .addExternalSymbol("__arm_sme_state_size")
         .addReg(AArch64::X0, RegState::ImplicitDefine)
-        .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
+        .addRegMask(TRI->getCallPreservedMask(
+            *MF, CallingConv::
+                     AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
             MI.getOperand(0).getReg())
         .addReg(AArch64::X0);
@@ -3108,12 +3109,13 @@ AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI,
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   Register ResultReg = MI.getOperand(0).getReg();
   if (FuncInfo->isPStateSMRegUsed()) {
-    RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
     const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
-        .addExternalSymbol(getLibcallName(LC))
+        .addExternalSymbol("__arm_sme_state")
         .addReg(AArch64::X0, RegState::ImplicitDefine)
-        .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
+        .addRegMask(TRI->getCallPreservedMask(
+            *MF, CallingConv::
+                     AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2));
     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg)
         .addReg(AArch64::X0);
   } else {
@@ -5737,15 +5739,15 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
 SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
                                                   SDValue Chain, SDLoc DL,
                                                   EVT VT) const {
-  RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
-  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+  SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
                                          getPointerTy(DAG.getDataLayout()));
   Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
   Type *RetTy = StructType::get(Int64Ty, Int64Ty);
   TargetLowering::CallLoweringInfo CLI(DAG);
   ArgListTy Args;
   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
-      getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
+      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
+      RetTy, Callee, std::move(Args));
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
   return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
@@ -8598,12 +8600,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
 }
 
 static SMECallAttrs
-getSMECallAttrs(const Function &Caller, const TargetLowering &TLI,
+getSMECallAttrs(const Function &Caller,
                 const TargetLowering::CallLoweringInfo &CLI) {
   if (CLI.CB)
-    return SMECallAttrs(*CLI.CB, &TLI);
+    return SMECallAttrs(*CLI.CB);
   if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
-    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
+    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol()));
   return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
 }
 
@@ -8625,7 +8627,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
 
   // SME Streaming functions are not eligible for TCO as they may require
   // the streaming mode or ZA to be restored after returning from the call.
-  SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
+  SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, CLI);
   if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
       CallAttrs.requiresPreservingAllZAState() ||
       CallAttrs.caller().hasStreamingBody())
@@ -8919,14 +8921,14 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
       DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64);
   Args.push_back(Entry);
 
-  RTLIB::Libcall LC =
-      IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
-  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
-                                         TLI.getPointerTy(DAG.getDataLayout()));
+  SDValue Callee =
+      DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore",
+                            TLI.getPointerTy(DAG.getDataLayout()));
   auto *RetTy = Type::getVoidTy(*DAG.getContext());
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
-      TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
+      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy,
+      Callee, std::move(Args));
   return TLI.LowerCallTo(CLI).second;
 }
 
@@ -9114,7 +9116,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   // Determine whether we need any streaming mode changes.
-  SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+  SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), CLI);
 
   auto DescribeCallsite =
       [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
@@ -9691,12 +9693,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (RequiresLazySave) {
     // Conditionally restore the lazy save using a pseudo node.
-    RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
     TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
     SDValue RegMask = DAG.getRegisterMask(
-        TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
+        TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
     SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
-        getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+        "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
     SDValue TPIDR2_EL0 = DAG.getNode(
         ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
         DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
@@ -29035,7 +29036,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
 
   // Checks to allow the use of SME instructions
   if (auto *Base = dyn_cast<CallBase>(&Inst)) {
-    auto CallAttrs = SMECallAttrs(*Base, this);
+    auto CallAttrs = SMECallAttrs(*Base);
     if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
         CallAttrs.requiresPreservingZT0() ||
         CallAttrs.requiresPreservingAllZAState())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index fb59c9f..a55f103 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5920,7 +5920,7 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
   // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
   SmallString<64> Expr;
   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
-  assert(DwarfReg >= 0 && DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
+  assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
   // Reg + NumBytes
   Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
   appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4523c65..3fba7e8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -220,16 +220,20 @@ static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
 static cl::opt<bool> EnableScalableAutovecInStreamingMode(
     "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
 
-static bool isSMEABIRoutineCall(const CallInst &CI, const TargetLowering &TLI) {
+static bool isSMEABIRoutineCall(const CallInst &CI) {
   const auto *F = CI.getCalledFunction();
-  return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
+  return F && StringSwitch<bool>(F->getName())
+                  .Case("__arm_sme_state", true)
+                  .Case("__arm_tpidr2_save", true)
+                  .Case("__arm_tpidr2_restore", true)
+                  .Case("__arm_za_disable", true)
+                  .Default(false);
 }
 
 /// Returns true if the function has explicit operations that can only be
 /// lowered using incompatible instructions for the selected mode. This also
 /// returns true if the function F may use or modify ZA state.
-static bool hasPossibleIncompatibleOps(const Function *F,
-                                       const TargetLowering &TLI) {
+static bool hasPossibleIncompatibleOps(const Function *F) {
   for (const BasicBlock &BB : *F) {
     for (const Instruction &I : BB) {
       // Be conservative for now and assume that any call to inline asm or to
@@ -238,7 +242,7 @@ static bool hasPossibleIncompatibleOps(const Function *F,
       // all native LLVM instructions can be lowered to compatible instructions.
       if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
           (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
-           isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
+           isSMEABIRoutineCall(cast<CallInst>(I))))
         return true;
     }
   }
@@ -286,7 +290,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
   if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
       CallAttrs.requiresPreservingZT0() ||
       CallAttrs.requiresPreservingAllZAState()) {
-    if (hasPossibleIncompatibleOps(Callee, *getTLI()))
+    if (hasPossibleIncompatibleOps(Callee))
       return false;
   }
 
@@ -353,7 +357,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
   // change only once and avoid inlining of G into F.
 
   SMEAttrs FAttrs(*F);
-  SMECallAttrs CallAttrs(Call, getTLI());
+  SMECallAttrs CallAttrs(Call);
 
   if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
     if (F == Call.getCaller()) // (1)
@@ -4333,7 +4337,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
 
 InstructionCost
 AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
-                                          const SCEV *Ptr) const {
+                                          const SCEV *Ptr,
+                                          TTI::TargetCostKind CostKind) const {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 647b242..9c96fdd 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -238,8 +238,9 @@ public:
       ArrayRef<const Value *> Args = {},
       const Instruction *CxtI = nullptr) const override;
 
-  InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
-                                            const SCEV *Ptr) const override;
+  InstructionCost
+  getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+                            TTI::TargetCostKind CostKind) const override;
 
   InstructionCost getCmpSelInstrCost(
       unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp
index 2008516..4af4d49 100644
--- a/llvm/lib/Target/AArch64/SMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp
@@ -15,16 +15,11 @@
 #include "AArch64.h"
 #include "Utils/AArch64SMEAttributes.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IR/RuntimeLibcalls.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
 using namespace llvm;
@@ -38,13 +33,9 @@ struct SMEABI : public FunctionPass {
 
   bool runOnFunction(Function &F) override;
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetPassConfig>();
-  }
-
 private:
   bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder,
-                               SMEAttrs FnAttrs, const TargetLowering &TLI);
+                               SMEAttrs FnAttrs);
 };
 } // end anonymous namespace
 
@@ -60,16 +51,14 @@ FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); }
 //===----------------------------------------------------------------------===//
 
 // Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0.
-void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI,
-                    bool ZT0IsUndef = false) {
+void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) {
   auto &Ctx = M->getContext();
   auto *TPIDR2SaveTy =
       FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false);
   auto Attrs =
       AttributeList().addFnAttribute(Ctx, "aarch64_pstate_sm_compatible");
-  RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_SAVE;
   FunctionCallee Callee =
-      M->getOrInsertFunction(TLI.getLibcallName(LC), TPIDR2SaveTy, Attrs);
+      M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs);
   CallInst *Call = Builder.CreateCall(Callee);
 
   // If ZT0 is undefined (i.e. we're at the entry of a "new_zt0" function), mark
@@ -78,7 +67,8 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI,
   if (ZT0IsUndef)
     Call->addFnAttr(Attribute::get(Ctx, "aarch64_zt0_undef"));
 
-  Call->setCallingConv(TLI.getLibcallCallingConv(LC));
+  Call->setCallingConv(
+      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0);
 
   // A save to TPIDR2 should be followed by clearing TPIDR2_EL0.
   Function *WriteIntr =
@@ -108,8 +98,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI,
 /// interface if it does not share ZA or ZT0.
 ///
 bool SMEABI::updateNewStateFunctions(Module *M, Function *F,
-                                     IRBuilder<> &Builder, SMEAttrs FnAttrs,
-                                     const TargetLowering &TLI) {
+                                     IRBuilder<> &Builder, SMEAttrs FnAttrs) {
   LLVMContext &Context = F->getContext();
   BasicBlock *OrigBB = &F->getEntryBlock();
   Builder.SetInsertPoint(&OrigBB->front());
@@ -135,7 +124,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F,
 
     // Create a call __arm_tpidr2_save, which commits the lazy save.
     Builder.SetInsertPoint(&SaveBB->back());
-    emitTPIDR2Save(M, Builder, TLI, /*ZT0IsUndef=*/FnAttrs.isNewZT0());
+    emitTPIDR2Save(M, Builder, /*ZT0IsUndef=*/FnAttrs.isNewZT0());
 
     // Enable pstate.za at the start of the function.
     Builder.SetInsertPoint(&OrigBB->front());
@@ -183,14 +172,10 @@ bool SMEABI::runOnFunction(Function &F) {
   if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za"))
     return false;
 
-  const TargetMachine &TM =
-      getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
-  const TargetLowering &TLI = *TM.getSubtargetImpl(F)->getTargetLowering();
-
   bool Changed = false;
   SMEAttrs FnAttrs(F);
   if (FnAttrs.isNewZA() || FnAttrs.isNewZT0())
-    Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs, TLI);
+    Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs);
 
   return Changed;
 }
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index 934f68b..271094f 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -7,9 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64SMEAttributes.h"
-#include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/RuntimeLibcalls.h"
 #include <cassert>
 
 using namespace llvm;
@@ -79,36 +77,19 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) {
     Bitmask |= encodeZT0State(StateValue::New);
 }
 
-void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName,
-                                     const TargetLowering &TLI) {
-  RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName);
-  if (Impl == RTLIB::Unsupported)
-    return;
-  RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName) {
   unsigned KnownAttrs = SMEAttrs::Normal;
-  switch (LC) {
-  case RTLIB::SMEABI_SME_STATE:
-  case RTLIB::SMEABI_TPIDR2_SAVE:
-  case RTLIB::SMEABI_GET_CURRENT_VG:
-  case RTLIB::SMEABI_SME_STATE_SIZE:
-  case RTLIB::SMEABI_SME_SAVE:
-  case RTLIB::SMEABI_SME_RESTORE:
-    KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine;
-    break;
-  case RTLIB::SMEABI_ZA_DISABLE:
-  case RTLIB::SMEABI_TPIDR2_RESTORE:
+  if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state")
+    KnownAttrs |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine);
+  if (FuncName == "__arm_tpidr2_restore")
     KnownAttrs |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) |
                   SMEAttrs::SME_ABI_Routine;
-    break;
-  case RTLIB::SC_MEMCPY:
-  case RTLIB::SC_MEMMOVE:
-  case RTLIB::SC_MEMSET:
-  case RTLIB::SC_MEMCHR:
+  if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" ||
+      FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr")
     KnownAttrs |= SMEAttrs::SM_Compatible;
-    break;
-  default:
-    break;
-  }
+  if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" ||
+      FuncName == "__arm_sme_state_size")
+    KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine;
   set(KnownAttrs);
 }
 
@@ -129,11 +110,11 @@ bool SMECallAttrs::requiresSMChange() const {
   return true;
 }
 
-SMECallAttrs::SMECallAttrs(const CallBase &CB, const TargetLowering *TLI)
+SMECallAttrs::SMECallAttrs(const CallBase &CB)
     : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal),
       Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) {
   if (auto *CalledFunction = CB.getCalledFunction())
-    CalledFn = SMEAttrs(*CalledFunction, TLI);
+    CalledFn = SMEAttrs(*CalledFunction, SMEAttrs::InferAttrsFromName::Yes);
 
   // FIXME: We probably should not allow SME attributes on direct calls but
   // clang duplicates streaming mode attributes at each callsite.
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
index 06376c7..f1be0ecb 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
@@ -13,8 +13,6 @@
 
 namespace llvm {
 
-class TargetLowering;
-
 class Function;
 class CallBase;
 class AttributeList;
@@ -50,17 +48,17 @@ public:
     CallSiteFlags_Mask = ZT0_Undef
   };
 
+  enum class InferAttrsFromName { No, Yes };
+
   SMEAttrs() = default;
   SMEAttrs(unsigned Mask) { set(Mask); }
-  SMEAttrs(const Function &F, const TargetLowering *TLI = nullptr)
+  SMEAttrs(const Function &F, InferAttrsFromName Infer = InferAttrsFromName::No)
       : SMEAttrs(F.getAttributes()) {
-    if (TLI)
-      addKnownFunctionAttrs(F.getName(), *TLI);
+    if (Infer == InferAttrsFromName::Yes)
+      addKnownFunctionAttrs(F.getName());
   }
   SMEAttrs(const AttributeList &L);
-  SMEAttrs(StringRef FuncName, const TargetLowering &TLI) {
-    addKnownFunctionAttrs(FuncName, TLI);
-  };
+  SMEAttrs(StringRef FuncName) { addKnownFunctionAttrs(FuncName); };
 
   void set(unsigned M, bool Enable = true);
 
@@ -148,7 +146,7 @@ public:
   }
 
 private:
-  void addKnownFunctionAttrs(StringRef FuncName, const TargetLowering &TLI);
+  void addKnownFunctionAttrs(StringRef FuncName);
 };
 
 /// SMECallAttrs is a utility class to hold the SMEAttrs for a callsite. It has
@@ -165,7 +163,7 @@ public:
                SMEAttrs Callsite = SMEAttrs::Normal)
       : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {}
 
-  SMECallAttrs(const CallBase &CB, const TargetLowering *TLI);
+  SMECallAttrs(const CallBase &CB);
 
   SMEAttrs &caller() { return CallerFn; }
   SMEAttrs &callee() { return IsIndirect ? Callsite : CalledFn; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 007b481..0059a86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
-struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
 void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &);
 extern char &SIOptimizeExecMaskingPreRAID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6c6d92..6ddfa38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,7 +29,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
-MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 5a6ad40..8c56c21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -724,10 +724,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
 
   addRulesForGOpcs({G_PTR_ADD})
-      .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
-      .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
-      .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}})
-      .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}});
+      .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
+      .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
+      .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
+      .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
 
   addRulesForGOpcs({G_INTTOPTR})
       .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c1f1703..e393aa19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -848,8 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         if (Level == OptimizationLevel::O0)
           return;
 
-        PM.addPass(AMDGPUUnifyMetadataPass());
-
         // We don't want to run internalization at per-module stage.
         if (InternalizeSymbols && !isLTOPreLink(Phase)) {
           PM.addPass(InternalizePass(mustPreserveGV));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
deleted file mode 100644
index e400491..0000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// \file
-// This pass that unifies multiple OpenCL metadata due to linking.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace {
-
-  namespace kOCLMD {
-
-    const char SpirVer[]            = "opencl.spir.version";
-    const char OCLVer[]             = "opencl.ocl.version";
-    const char UsedExt[]            = "opencl.used.extensions";
-    const char UsedOptCoreFeat[]    = "opencl.used.optional.core.features";
-    const char CompilerOptions[]    = "opencl.compiler.options";
-    const char LLVMIdent[]          = "llvm.ident";
-
-  } // end namespace kOCLMD
-
-    /// Unify version metadata.
-    /// \return true if changes are made.
-    /// Assume the named metadata has operands each of which is a pair of
-    /// integer constant, e.g.
-    /// !Name = {!n1, !n2}
-    /// !n1 = {i32 1, i32 2}
-    /// !n2 = {i32 2, i32 0}
-    /// Keep the largest version as the sole operand if PickFirst is false.
-    /// Otherwise pick it from the first value, representing kernel module.
-    bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) {
-      auto *NamedMD = M.getNamedMetadata(Name);
-      if (!NamedMD || NamedMD->getNumOperands() <= 1)
-        return false;
-      MDNode *MaxMD = nullptr;
-      auto MaxVer = 0U;
-      for (auto *VersionMD : NamedMD->operands()) {
-        assert(VersionMD->getNumOperands() == 2);
-        auto *CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0));
-        auto VersionMajor = CMajor->getZExtValue();
-        auto *CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1));
-        auto VersionMinor = CMinor->getZExtValue();
-        auto Ver = (VersionMajor * 100) + (VersionMinor * 10);
-        if (Ver > MaxVer) {
-          MaxVer = Ver;
-          MaxMD = VersionMD;
-        }
-        if (PickFirst)
-          break;
-      }
-      NamedMD->eraseFromParent();
-      NamedMD = M.getOrInsertNamedMetadata(Name);
-      NamedMD->addOperand(MaxMD);
-      return true;
-    }
-
-  /// Unify version metadata.
-  /// \return true if changes are made.
-  /// Assume the named metadata has operands each of which is a list e.g.
-  /// !Name = {!n1, !n2}
-  /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}}
-  /// !n2 = !{!"cl_khr_image"}
-  /// Combine it into a single list with unique operands.
-  bool unifyExtensionMD(Module &M, StringRef Name) {
-    auto *NamedMD = M.getNamedMetadata(Name);
-    if (!NamedMD || NamedMD->getNumOperands() == 1)
-      return false;
-
-    SmallVector<Metadata *, 4> All;
-    for (auto *MD : NamedMD->operands())
-      for (const auto &Op : MD->operands())
-        if (!llvm::is_contained(All, Op.get()))
-          All.push_back(Op.get());
-
-    NamedMD->eraseFromParent();
-    NamedMD = M.getOrInsertNamedMetadata(Name);
-    for (const auto &MD : All)
-      NamedMD->addOperand(MDNode::get(M.getContext(), MD));
-
-    return true;
-  }
-
-  /// Unify multiple OpenCL metadata due to linking.
-  bool unifyMetadataImpl(Module &M) {
-    const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer};
-    const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat,
-                          kOCLMD::CompilerOptions, kOCLMD::LLVMIdent};
-
-    bool Changed = false;
-
-    for (auto &I : Vers)
-      Changed |= unifyVersionMD(M, I, true);
-
-    for (auto &I : Exts)
-      Changed |= unifyExtensionMD(M, I);
-
-    return Changed;
-  }
-
-  } // end anonymous namespace
-
-  PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M,
-                                                 ModuleAnalysisManager &AM) {
-    return unifyMetadataImpl(M) ? PreservedAnalyses::none()
-                                : PreservedAnalyses::all();
-  }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c466f9c..dc9dd22 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -114,7 +114,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetTransformInfo.cpp
   AMDGPUWaitSGPRHazards.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
-  AMDGPUUnifyMetadata.cpp
   R600MachineCFGStructurizer.cpp
   GCNCreateVOPD.cpp
   GCNDPPCombine.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 2d0102f..7c01903 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -197,7 +197,7 @@ enum ClassFlags : unsigned {
 
 namespace AMDGPU {
 enum OperandType : unsigned {
-  /// Operands with register or 32-bit immediate
+  /// Operands with register, 32-bit, or 64-bit immediate
   OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
   OPERAND_REG_IMM_INT64,
   OPERAND_REG_IMM_INT16,
@@ -407,7 +407,7 @@ enum CPol {
 
   SCAL = 1 << 11, // Scale offset bit
 
-  ALL = TH | SCOPE,
+  ALL = TH | SCOPE | NV,
 
   // Helper bits
   TH_TYPE_LOAD = 1 << 7,    // TH_LOAD policy
@@ -440,6 +440,7 @@ enum Id { // Message ID, width(4) [3:0].
   ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
   ID_GS_ALLOC_REQ = 9,       // added in GFX9
   ID_GET_DOORBELL = 10,      // added in GFX9, removed in GFX11
+  ID_SAVEWAVE_HAS_TDM = 10,  // added in GFX1250
   ID_GET_DDID = 11,          // added in GFX10, removed in GFX11
   ID_SYSMSG = 15,
 
@@ -513,6 +514,7 @@ enum Id { // HwRegCode, (6) [5:0]
   ID_HW_ID2 = 24,
   ID_POPS_PACKER = 25,
   ID_PERF_SNAPSHOT_DATA_gfx11 = 27,
+  ID_IB_STS2 = 28,
   ID_SHADER_CYCLES = 29,
   ID_SHADER_CYCLES_HI = 30,
   ID_DVGPR_ALLOC_LO = 31,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2e76225..f58fde4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16894,6 +16894,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
 
   const TargetRegisterClass *RC = nullptr;
   if (Constraint.size() == 1) {
+    // Defer to the generic handling when the bit size of the given value type
+    // cannot be determined. This can happen, for example, with an empty struct
+    // (size 0): `call void asm "", "v"({} poison)`.
+    if (VT == MVT::Other)
+      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
     const unsigned BitWidth = VT.getSizeInBits();
     switch (Constraint[0]) {
     default:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9278b85..c425d97 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2708,7 +2708,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
                                isModifierType<Src2VT>.ret,
                                HasOMod);
   field bit HasNeg = HasModifiers;
-  field bit HasMatrixReuse = 0;
   field bit HasMatrixFMT = 0;
   field bit HasMatrixScale = 0;
   field bit HasMatrixReuse = 0;
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f8878f3..f7a9a58 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -57,6 +57,7 @@
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
@@ -76,6 +77,7 @@ private:
   LiveIntervals *LIS = nullptr;
   LiveVariables *LV = nullptr;
   MachineDominatorTree *MDT = nullptr;
+  MachinePostDominatorTree *PDT = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   SetVector<MachineInstr*> LoweredEndCf;
   DenseSet<Register> LoweredIf;
@@ -138,8 +140,8 @@ private:
 
 public:
   SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV,
-                     MachineDominatorTree *MDT)
-      : LIS(LIS), LV(LV), MDT(MDT) {}
+                     MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
+      : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {}
   bool run(MachineFunction &MF);
 };
 
@@ -159,6 +161,7 @@ public:
     AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
     // Should preserve the same set that TwoAddressInstructions does.
     AU.addPreserved<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
     AU.addPreserved<SlotIndexesWrapperPass>();
     AU.addPreserved<LiveIntervalsWrapperPass>();
     AU.addPreserved<LiveVariablesWrapperPass>();
@@ -506,13 +509,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock *SplitBB = &MBB;
   if (NeedBlockSplit) {
     SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
-    if (MDT && SplitBB != &MBB) {
-      MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
-      SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
-                                                 MBBNode->end());
-      MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
-      for (MachineDomTreeNode *Child : Children)
-        MDT->changeImmediateDominator(Child, SplitBBNode);
+    if (SplitBB != &MBB && (MDT || PDT)) {
+      using DomTreeT = DomTreeBase<MachineBasicBlock>;
+      SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+      for (MachineBasicBlock *Succ : SplitBB->successors()) {
+        DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+        DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+      }
+      DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+      if (MDT)
+        MDT->applyUpdates(DTUpdates);
+      if (PDT)
+        PDT->applyUpdates(DTUpdates);
     }
     Opcode = OrTermrOpc;
     InsPt = MI;
@@ -727,26 +735,27 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
   MachineBasicBlock *Succ = *MBB.succ_begin();
   MachineBasicBlock *FallThrough = nullptr;
 
+  using DomTreeT = DomTreeBase<MachineBasicBlock>;
+  SmallVector<DomTreeT::UpdateType, 8> DTUpdates;
+
   while (!MBB.predecessors().empty()) {
     MachineBasicBlock *P = *MBB.pred_begin();
     if (P->getFallThrough(false) == &MBB)
       FallThrough = P;
     P->ReplaceUsesOfBlockWith(&MBB, Succ);
+    DTUpdates.push_back({DomTreeT::Insert, P, Succ});
+    DTUpdates.push_back({DomTreeT::Delete, P, &MBB});
   }
   MBB.removeSuccessor(Succ);
   if (LIS) {
     for (auto &I : MBB.instrs())
       LIS->RemoveMachineInstrFromMaps(I);
   }
-  if (MDT) {
-    // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
-    // updating by changing Succ's idom to the one of MBB; otherwise, MBB must
-    // be a leaf node in MDT and could be erased directly.
-    if (MDT->dominates(&MBB, Succ))
-      MDT->changeImmediateDominator(MDT->getNode(Succ),
-                                    MDT->getNode(&MBB)->getIDom());
-    MDT->eraseNode(&MBB);
-  }
+  if (MDT)
+    MDT->applyUpdates(DTUpdates);
+  if (PDT)
+    PDT->applyUpdates(DTUpdates);
+
   MBB.clear();
   MBB.eraseFromParent();
   if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
@@ -875,7 +884,11 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) {
   LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr;
   auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
   MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
-  return SILowerControlFlow(LIS, LV, MDT).run(MF);
+  auto *PDTWrapper =
+      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
+  MachinePostDominatorTree *PDT =
+      PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
+  return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
 }
 
 PreservedAnalyses
@@ -885,13 +898,16 @@ SILowerControlFlowPass::run(MachineFunction &MF,
   LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF);
   MachineDominatorTree *MDT =
       MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
+  MachinePostDominatorTree *PDT =
+      MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
 
-  bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF);
+  bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
   if (!Changed)
     return PreservedAnalyses::all();
 
   auto PA = getMachineFunctionPassPreservedAnalyses();
   PA.preserve<MachineDominatorTreeAnalysis>();
+  PA.preserve<MachinePostDominatorTreeAnalysis>();
   PA.preserve<SlotIndexesAnalysis>();
   PA.preserve<LiveIntervalsAnalysis>();
   PA.preserve<LiveVariablesAnalysis>();
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index c2f4dbf..a003a46 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1665,7 +1665,9 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
 
 def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
     [(int_amdgcn_s_sethalt timm:$simm16)]>;
-def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
+def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> {
+  let SubtargetPredicate = isNotGFX1250Plus;
+}
 
 // On SI the documentation says sleep for approximately 64 * low 2
 // bits, consistent with the reported maximum of 448. On VI the
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 3d9455f..c740b5e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -106,7 +106,7 @@ static constexpr CustomOperand MsgOperands[] = {
   {{"MSG_GET_DDID"},            ID_GET_DDID,                isGFX10},
   {{"MSG_HS_TESSFACTOR"},       ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus},
   {{"MSG_DEALLOC_VGPRS"},       ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus},
-  {{""}},
+  {{"MSG_SAVEWAVE_HAS_TDM"},    ID_SAVEWAVE_HAS_TDM,        isGFX1250},
   {{"MSG_SYSMSG"},              ID_SYSMSG},
   {{"MSG_RTN_GET_DOORBELL"},    ID_RTN_GET_DOORBELL,        isGFX11Plus},
   {{"MSG_RTN_GET_DDID"},        ID_RTN_GET_DDID,            isGFX11Plus},
@@ -195,7 +195,7 @@ static constexpr CustomOperand Operands[] = {
   {{"HW_REG_POPS_PACKER"},   ID_POPS_PACKER, isGFX10},
   {{""}},
   {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11},
-  {{""}},
+  {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250},
   {{"HW_REG_SHADER_CYCLES"},    ID_SHADER_CYCLES,    isGFX10_3_GFX11},
   {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus},
   {{"HW_REG_DVGPR_ALLOC_LO"},   ID_DVGPR_ALLOC_LO,   isGFX12Plus},
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d386c91..8ea567c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -587,167 +587,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     }
   }
 
-  // RTLIB
-  if (TM.isAAPCS_ABI() && (TT.isTargetAEABI() || TT.isTargetGNUAEABI() ||
-                           TT.isTargetMuslAEABI() || TT.isAndroid())) {
-    // FIXME: This does not depend on the subtarget and should go directly into
-    // RuntimeLibcalls. This is only here because of missing support for setting
-    // the calling convention of an implementation.
-    // clang-format off
-    static const struct {
-      const RTLIB::Libcall Op;
-      const RTLIB::LibcallImpl Impl;
-    } LibraryCalls[] = {
-      // Double-precision floating-point arithmetic helper functions
-      // RTABI chapter 4.1.2, Table 2
-      { RTLIB::ADD_F64, RTLIB::__aeabi_dadd },
-      { RTLIB::DIV_F64, RTLIB::__aeabi_ddiv },
-      { RTLIB::MUL_F64, RTLIB::__aeabi_dmul },
-      { RTLIB::SUB_F64, RTLIB::__aeabi_dsub },
-
-      // Double-precision floating-point comparison helper functions
-      // RTABI chapter 4.1.2, Table 3
-      { RTLIB::OEQ_F64, RTLIB::__aeabi_dcmpeq__oeq },
-      { RTLIB::UNE_F64, RTLIB::__aeabi_dcmpeq__une },
-      { RTLIB::OLT_F64, RTLIB::__aeabi_dcmplt },
-      { RTLIB::OLE_F64, RTLIB::__aeabi_dcmple },
-      { RTLIB::OGE_F64, RTLIB::__aeabi_dcmpge },
-      { RTLIB::OGT_F64, RTLIB::__aeabi_dcmpgt },
-      { RTLIB::UO_F64,  RTLIB::__aeabi_dcmpun },
-
-      // Single-precision floating-point arithmetic helper functions
-      // RTABI chapter 4.1.2, Table 4
-      { RTLIB::ADD_F32, RTLIB::__aeabi_fadd },
-      { RTLIB::DIV_F32, RTLIB::__aeabi_fdiv },
-      { RTLIB::MUL_F32, RTLIB::__aeabi_fmul },
-      { RTLIB::SUB_F32, RTLIB::__aeabi_fsub },
-
-      // Single-precision floating-point comparison helper functions
-      // RTABI chapter 4.1.2, Table 5
-      { RTLIB::OEQ_F32, RTLIB::__aeabi_fcmpeq__oeq },
-      { RTLIB::UNE_F32, RTLIB::__aeabi_fcmpeq__une },
-      { RTLIB::OLT_F32, RTLIB::__aeabi_fcmplt},
-      { RTLIB::OLE_F32, RTLIB::__aeabi_fcmple },
-      { RTLIB::OGE_F32, RTLIB::__aeabi_fcmpge },
-      { RTLIB::OGT_F32, RTLIB::__aeabi_fcmpgt },
-      { RTLIB::UO_F32,  RTLIB::__aeabi_fcmpun },
-
-      // Floating-point to integer conversions.
-      // RTABI chapter 4.1.2, Table 6
-      { RTLIB::FPTOSINT_F64_I32, RTLIB::__aeabi_d2iz },
-      { RTLIB::FPTOUINT_F64_I32, RTLIB::__aeabi_d2uiz },
-      { RTLIB::FPTOSINT_F64_I64, RTLIB::__aeabi_d2lz },
-      { RTLIB::FPTOUINT_F64_I64, RTLIB::__aeabi_d2ulz },
-      { RTLIB::FPTOSINT_F32_I32, RTLIB::__aeabi_f2iz },
-      { RTLIB::FPTOUINT_F32_I32, RTLIB::__aeabi_f2uiz },
-      { RTLIB::FPTOSINT_F32_I64, RTLIB::__aeabi_f2lz },
-      { RTLIB::FPTOUINT_F32_I64, RTLIB::__aeabi_f2ulz },
-
-      // Conversions between floating types.
-      // RTABI chapter 4.1.2, Table 7
-      { RTLIB::FPROUND_F64_F32, RTLIB::__aeabi_d2f },
-      { RTLIB::FPROUND_F64_F16, RTLIB::__aeabi_d2h },
-      { RTLIB::FPEXT_F32_F64,   RTLIB::__aeabi_f2d },
-
-      // Integer to floating-point conversions.
-      // RTABI chapter 4.1.2, Table 8
-      { RTLIB::SINTTOFP_I32_F64, RTLIB::__aeabi_i2d },
-      { RTLIB::UINTTOFP_I32_F64, RTLIB::__aeabi_ui2d },
-      { RTLIB::SINTTOFP_I64_F64, RTLIB::__aeabi_l2d },
-      { RTLIB::UINTTOFP_I64_F64, RTLIB::__aeabi_ul2d },
-      { RTLIB::SINTTOFP_I32_F32, RTLIB::__aeabi_i2f },
-      { RTLIB::UINTTOFP_I32_F32, RTLIB::__aeabi_ui2f },
-      { RTLIB::SINTTOFP_I64_F32, RTLIB::__aeabi_l2f },
-      { RTLIB::UINTTOFP_I64_F32, RTLIB::__aeabi_ul2f },
-
-      // Long long helper functions
-      // RTABI chapter 4.2, Table 9
-      { RTLIB::MUL_I64, RTLIB::__aeabi_lmul },
-      { RTLIB::SHL_I64, RTLIB::__aeabi_llsl },
-      { RTLIB::SRL_I64, RTLIB::__aeabi_llsr },
-      { RTLIB::SRA_I64, RTLIB::__aeabi_lasr },
-
-      // Integer division functions
-      // RTABI chapter 4.3.1
-      { RTLIB::SDIV_I32, RTLIB::__aeabi_idiv },
-      { RTLIB::SDIV_I64, RTLIB::__aeabi_ldivmod },
-      { RTLIB::UDIV_I32, RTLIB::__aeabi_uidiv },
-      { RTLIB::UDIV_I64, RTLIB::__aeabi_uldivmod },
-    };
-    // clang-format on
-
-    for (const auto &LC : LibraryCalls)
-      setLibcallImpl(LC.Op, LC.Impl);
-
-    // EABI dependent RTLIB
-    if (TM.Options.EABIVersion == EABI::EABI4 ||
-        TM.Options.EABIVersion == EABI::EABI5) {
-      static const struct {
-        const RTLIB::Libcall Op;
-        const RTLIB::LibcallImpl Impl;
-      } MemOpsLibraryCalls[] = {
-          // Memory operations
-          // RTABI chapter 4.3.4
-          {RTLIB::MEMCPY, RTLIB::__aeabi_memcpy},
-          {RTLIB::MEMMOVE, RTLIB::__aeabi_memmove},
-          {RTLIB::MEMSET, RTLIB::__aeabi_memset},
-          {RTLIB::AEABI_MEMCPY4, RTLIB::__aeabi_memcpy4},
-          {RTLIB::AEABI_MEMCPY8, RTLIB::__aeabi_memcpy8},
-          {RTLIB::AEABI_MEMMOVE4, RTLIB::__aeabi_memmove4},
-          {RTLIB::AEABI_MEMMOVE8, RTLIB::__aeabi_memmove8},
-          {RTLIB::AEABI_MEMSET4, RTLIB::__aeabi_memset4},
-          {RTLIB::AEABI_MEMSET8, RTLIB::__aeabi_memset8},
-          {RTLIB::AEABI_MEMCLR, RTLIB::__aeabi_memclr},
-          {RTLIB::AEABI_MEMCLR4, RTLIB::__aeabi_memclr4},
-          {RTLIB::AEABI_MEMCLR8, RTLIB::__aeabi_memclr8},
-      };
-
-      for (const auto &LC : MemOpsLibraryCalls)
-        setLibcallImpl(LC.Op, LC.Impl);
-    }
-  }
-
-  // The half <-> float conversion functions are always soft-float on
-  // non-watchos platforms, but are needed for some targets which use a
-  // hard-float calling convention by default.
-  if (!TT.isWatchABI()) {
-    if (TM.isAAPCS_ABI()) {
-      setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_AAPCS);
-      setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_AAPCS);
-      setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_AAPCS);
-      setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_AAPCS);
-      setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_AAPCS);
-    } else {
-      setLibcallImplCallingConv(RTLIB::__truncsfhf2, CallingConv::ARM_APCS);
-      setLibcallImplCallingConv(RTLIB::__truncdfhf2, CallingConv::ARM_APCS);
-      setLibcallImplCallingConv(RTLIB::__extendhfsf2, CallingConv::ARM_APCS);
-      setLibcallImplCallingConv(RTLIB::__gnu_h2f_ieee, CallingConv::ARM_APCS);
-      setLibcallImplCallingConv(RTLIB::__gnu_f2h_ieee, CallingConv::ARM_APCS);
-    }
-  }
-
-  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
-  // a __gnu_ prefix (which is the default).
-  if (TT.isTargetAEABI()) {
-    // FIXME: This does not depend on the subtarget and should go directly into
-    // RuntimeLibcalls. This is only here because of missing support for setting
-    // the calling convention of an implementation.
-    static const struct {
-      const RTLIB::Libcall Op;
-      const RTLIB::LibcallImpl Impl;
-    } LibraryCalls[] = {
-        {RTLIB::FPROUND_F32_F16, RTLIB::__aeabi_f2h},
-        {RTLIB::FPEXT_F16_F32, RTLIB::__aeabi_h2f},
-    };
-
-    for (const auto &LC : LibraryCalls) {
-      setLibcallImpl(LC.Op, LC.Impl);
-    }
-  } else if (!TT.isOSBinFormatMachO()) {
-    setLibcallImpl(RTLIB::FPROUND_F32_F16, RTLIB::__gnu_f2h_ieee);
-    setLibcallImpl(RTLIB::FPEXT_F16_F32, RTLIB::__gnu_h2f_ieee);
-  }
-
   if (Subtarget->isThumb1Only())
     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
   else
@@ -7406,7 +7245,7 @@ static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  if (M.size() != NumElts && M.size() != NumElts*2)
+  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
     return false;
 
   // If the mask is twice as long as the input vector then we need to check the
@@ -7438,7 +7277,7 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  if (M.size() != NumElts && M.size() != NumElts*2)
+  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
     return false;
 
   for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7541,7 +7380,7 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  if (M.size() != NumElts && M.size() != NumElts*2)
+  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
     return false;
 
   for (unsigned i = 0; i < M.size(); i += NumElts) {
@@ -7574,7 +7413,7 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  if (M.size() != NumElts && M.size() != NumElts*2)
+  if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
     return false;
 
   for (unsigned i = 0; i < M.size(); i += NumElts) {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 393cf2d..6b28541 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1084,9 +1084,10 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(
                                               CostKind, Op1Info, Op2Info, I);
 }
 
-InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy,
-                                                      ScalarEvolution *SE,
-                                                      const SCEV *Ptr) const {
+InstructionCost
+ARMTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+                                      const SCEV *Ptr,
+                                      TTI::TargetCostKind CostKind) const {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
@@ -1103,7 +1104,7 @@ InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy,
     // addressing mode.
     return 1;
   }
-  return BaseT::getAddressComputationCost(PtrTy, SE, Ptr);
+  return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
 }
 
 bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
@@ -1335,6 +1336,39 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
     if (!Mask.empty()) {
       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
+      // Check for LD2/LD4 instructions, which are represented in llvm IR as
+      // deinterleaving-shuffle(load). The shuffle cost could potentially be
+      // free, but we model it with a cost of LT.first so that LD2/LD4 have a
+      // higher cost than just the load.
+      if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
+          (LT.second.getScalarSizeInBits() == 8 ||
+           LT.second.getScalarSizeInBits() == 16 ||
+           LT.second.getScalarSizeInBits() == 32) &&
+          LT.second.getSizeInBits() == 128 &&
+          ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
+            ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) ||
+           (TLI->getMaxSupportedInterleaveFactor() == 4 &&
+            ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))))
+        return ST->getMVEVectorCostFactor(CostKind) *
+               std::max<InstructionCost>(1, LT.first / 4);
+
+      // Check for ST2/ST4 instructions, which are represented in llvm IR as
+      // store(interleaving-shuffle). The shuffle cost could potentially be
+      // free, but we model it with a cost of LT.first so that ST2/ST4 have a
+      // higher cost than just the store.
+      if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
+          (LT.second.getScalarSizeInBits() == 8 ||
+           LT.second.getScalarSizeInBits() == 16 ||
+           LT.second.getScalarSizeInBits() == 32) &&
+          LT.second.getSizeInBits() == 128 &&
+          ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
+            ShuffleVectorInst::isInterleaveMask(
+                Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
+           (TLI->getMaxSupportedInterleaveFactor() == 4 &&
+            ShuffleVectorInst::isInterleaveMask(
+                Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
+        return ST->getMVEVectorCostFactor(CostKind) * LT.first;
+
       if (LT.second.isVector() &&
           Mask.size() <= LT.second.getVectorNumElements() &&
           (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 522c235..cdd8bcb 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -257,8 +257,9 @@ public:
                                      unsigned Index, const Value *Op0,
                                      const Value *Op1) const override;
 
-  InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
-                                            const SCEV *Ptr) const override;
+  InstructionCost
+  getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr,
+                            TTI::TargetCostKind CostKind) const override;
 
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index ece6c10..0e97483 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -3373,12 +3373,12 @@ public:
 
   void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask())));
+    Inst.addOperand(MCOperand::createImm(getMSRMask()));
   }
 
   void addBankedRegOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::createImm(unsigned(getBankedReg())));
+    Inst.addOperand(MCOperand::createImm(getBankedReg()));
   }
 
   void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 5c21281..171e294 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -156,9 +156,10 @@ HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
-InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *PtrTy,
-                                                          ScalarEvolution *SE,
-                                                          const SCEV *S) const {
+InstructionCost
+HexagonTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+                                          const SCEV *S,
+                                          TTI::TargetCostKind CostKind) const {
   return 0;
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 0a5766d..dbf16c9 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -111,8 +111,9 @@ public:
   InstructionCost
   getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                         TTI::TargetCostKind CostKind) const override;
-  InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
-                                            const SCEV *S) const override;
+  InstructionCost
+  getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *S,
+                            TTI::TargetCostKind CostKind) const override;
   InstructionCost getMemoryOpCost(
       unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
       TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 1447241..a2a41d0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2786,7 +2786,7 @@ SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op,
   EVT RetVT = Op.getValueType();
   RTLIB::Libcall LC = RTLIB::getUINTTOFP(OpVT, RetVT);
   MakeLibCallOptions CallOptions;
-  CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true);
+  CallOptions.setTypeListBeforeSoften(OpVT, RetVT);
   SDValue Chain = SDValue();
   SDValue Result;
   std::tie(Result, Chain) =
@@ -2811,7 +2811,7 @@ SDValue LoongArchTargetLowering::lowerSINT_TO_FP(SDValue Op,
   EVT RetVT = Op.getValueType();
   RTLIB::Libcall LC = RTLIB::getSINTTOFP(OpVT, RetVT);
   MakeLibCallOptions CallOptions;
-  CallOptions.setTypeListBeforeSoften(OpVT, RetVT, true);
+  CallOptions.setTypeListBeforeSoften(OpVT, RetVT);
   SDValue Chain = SDValue();
   SDValue Result;
   std::tie(Result, Chain) =
@@ -4107,7 +4107,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
     LC = RTLIB::getFPTOSINT(Src.getValueType(), VT);
     MakeLibCallOptions CallOptions;
     EVT OpVT = Src.getValueType();
-    CallOptions.setTypeListBeforeSoften(OpVT, VT, true);
+    CallOptions.setTypeListBeforeSoften(OpVT, VT);
     SDValue Chain = SDValue();
     SDValue Result;
     std::tie(Result, Chain) =
@@ -4360,7 +4360,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
     RTLIB::Libcall LC =
         OpVT == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32;
     MakeLibCallOptions CallOptions;
-    CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true);
+    CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64);
     SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first;
     Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result);
     Results.push_back(Result);
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index d8bb16f..0696b11 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1640,6 +1640,24 @@ defm : PairInsertExtractPatV8<v8f32, f32>;
 defm : PairInsertExtractPatV4<v4i64, GRLenVT>;
 defm : PairInsertExtractPatV4<v4f64, f64>;
 
+def : Pat<(vector_insert v8i32:$xd, (GRLenVT(vector_extract v8i32:$xj, 0)),
+              uimm3:$imm),
+          (XVINSVE0_W v8i32:$xd, v8i32:$xj, uimm3:$imm)>;
+
+def : Pat<(vector_insert v4i64:$xd, (GRLenVT(vector_extract v4i64:$xj, 0)),
+              uimm2:$imm),
+          (XVINSVE0_D v4i64:$xd, v4i64:$xj, uimm2:$imm)>;
+
+def : Pat<(vector_insert v8i32:$xd,
+              (GRLenVT(vector_extract v8i32:$xj, uimm3:$imm1)), uimm3:$imm2),
+          (XVINSVE0_W v8i32:$xd, (XVPICKVE_W v8i32:$xj, uimm3:$imm1),
+              uimm3:$imm2)>;
+
+def : Pat<(vector_insert v4i64:$xd,
+              (GRLenVT(vector_extract v4i64:$xj, uimm2:$imm1)), uimm2:$imm2),
+          (XVINSVE0_D v4i64:$xd, (XVPICKVE_D v4i64:$xj, uimm2:$imm1),
+              uimm2:$imm2)>;
+
 // PseudoXVINSGR2VR_{B/H}
 def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm),
           (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 18aeda6..2445005 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -70,7 +70,7 @@ NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
 }
 
 bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
-  return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N);
+  return Subtarget->getTargetLowering()->usePrecSqrtF32(N);
 }
 
 bool NVPTXDAGToDAGISel::useF32FTZ() const {
@@ -82,11 +82,6 @@ bool NVPTXDAGToDAGISel::allowFMA() const {
   return TL->allowFMA(*MF, OptLevel);
 }
 
-bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
-  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
-  return TL->allowUnsafeFPMath(*MF);
-}
-
 bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
 
 /// Select - Select instructions not customized! Used for
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 357e915f..6573172 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -44,7 +44,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool usePrecSqrtF32(const SDNode *N) const;
   bool useF32FTZ() const;
   bool allowFMA() const;
-  bool allowUnsafeFPMath() const;
   bool doRsqrtOpt() const;
 
   NVPTXScopes Scopes{};
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3daf25d..b94cbd0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -125,10 +125,6 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
   if (UsePrecDivF32.getNumOccurrences() > 0)
     return UsePrecDivF32;
 
-  // Otherwise, use div.approx if fast math is enabled
-  if (allowUnsafeFPMath(MF))
-    return NVPTX::DivPrecisionLevel::Approx;
-
   const SDNodeFlags Flags = N.getFlags();
   if (Flags.hasApproximateFuncs())
     return NVPTX::DivPrecisionLevel::Approx;
@@ -136,16 +132,11 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
   return NVPTX::DivPrecisionLevel::IEEE754;
 }
 
-bool NVPTXTargetLowering::usePrecSqrtF32(const MachineFunction &MF,
-                                         const SDNode *N) const {
+bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const {
   // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
   if (UsePrecSqrtF32.getNumOccurrences() > 0)
     return UsePrecSqrtF32;
 
-  // Otherwise, use sqrt.approx if fast math is enabled
-  if (allowUnsafeFPMath(MF))
-    return false;
-
   if (N) {
     const SDNodeFlags Flags = N->getFlags();
     if (Flags.hasApproximateFuncs())
@@ -1193,8 +1184,7 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                              bool &UseOneConst,
                                              bool Reciprocal) const {
   if (!(Enabled == ReciprocalEstimate::Enabled ||
-        (Enabled == ReciprocalEstimate::Unspecified &&
-         !usePrecSqrtF32(DAG.getMachineFunction()))))
+        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
     return SDValue();
 
   if (ExtraSteps == ReciprocalEstimate::Unspecified)
@@ -2851,8 +2841,7 @@ static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
                      SDLoc(Op), Opcode, DAG);
 }
 
-static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
-                         bool AllowUnsafeFPMath) {
+static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) {
   // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
   // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
   // the semantics of LLVM's frem.
@@ -2869,7 +2858,7 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
   SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
                             Flags | SDNodeFlags::AllowContract);
 
-  if (AllowUnsafeFPMath || Flags.hasNoInfs())
+  if (Flags.hasNoInfs())
     return Sub;
 
   // If Y is infinite, return X
@@ -3014,7 +3003,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ:
     return lowerCTLZCTPOP(Op, DAG);
   case ISD::FREM:
-    return lowerFREM(Op, DAG, allowUnsafeFPMath(DAG.getMachineFunction()));
+    return lowerFREM(Op, DAG);
 
   default:
     llvm_unreachable("Custom lowering not defined for operation");
@@ -4868,17 +4857,7 @@ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
   if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
     return true;
 
-  return allowUnsafeFPMath(MF);
-}
-
-bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const {
-  // Honor TargetOptions flags that explicitly say unsafe math is okay.
-  if (MF.getTarget().Options.UnsafeFPMath)
-    return true;
-
-  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
-  const Function &F = MF.getFunction();
-  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
+  return false;
 }
 
 static bool isConstZero(const SDValue &Operand) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 43e721a..27f099e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -206,8 +206,7 @@ public:
 
   // Get whether we should use a precise or approximate 32-bit floating point
   // sqrt instruction.
-  bool usePrecSqrtF32(const MachineFunction &MF,
-                      const SDNode *N = nullptr) const;
+  bool usePrecSqrtF32(const SDNode *N = nullptr) const;
 
   // Get whether we should use instructions that flush floating-point denormals
   // to sign-preserving zero.
@@ -220,7 +219,6 @@ public:
   unsigned combineRepeatedFPDivisors() const override { return 2; }
 
   bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const;
-  bool allowUnsafeFPMath(const MachineFunction &MF) const;
 
   bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                   EVT) const override {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index bd54d1d..ebb5e32 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1133,9 +1133,8 @@ defm FMA_F64    : FMA<F64RT,    allow_ftz = false>;
 // sin/cos/tanh
 
 class UnaryOpAllowsApproxFn<SDPatternOperator operator>
-    : PatFrag<(ops node:$A),
-              (operator node:$A), [{
-  return allowUnsafeFPMath() || N->getFlags().hasApproximateFuncs();
+    : PatFrag<(ops node:$A), (operator node:$A), [{
+  return N->getFlags().hasApproximateFuncs();
 }]>;
 
 def SIN_APPROX_f32 :
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 5998653..9e1530a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -18,6 +18,7 @@
 #include "RISCVInstrInfo.h"
 #include "RISCVSelectionDAGInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/Debug.h"
@@ -772,6 +773,49 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
   return false;
 }
 
+// (xor X, (and (xor X, C1), C2)), C2 a shifted mask (width=Width, shift=ShAmt)
+// -> (qc.insbi X, (C1 >> ShAmt), Width, ShAmt) if (C1 >> ShAmt) is a simm5
+// -> (qc.insb X, materialized (C1 >> ShAmt), Width, ShAmt) otherwise
+bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromXor(SDNode *Node) {
+
+  if (!Subtarget->hasVendorXqcibm())
+    return false;
+
+  using namespace SDPatternMatch;
+
+  SDValue X;
+  APInt CImm, CMask;
+  if (!sd_match(
+          Node,
+          m_Xor(m_Value(X),
+                m_OneUse(m_And(m_OneUse(m_Xor(m_Deferred(X), m_ConstInt(CImm))),
+                               m_ConstInt(CMask))))))
+    return false;
+
+  unsigned Width, ShAmt;
+  if (!CMask.isShiftedMask(ShAmt, Width))
+    return false;
+
+  int64_t Imm = CImm.getSExtValue();
+  Imm >>= ShAmt;
+
+  SDLoc DL(Node);
+  SDValue ImmNode;
+  auto Opc = RISCV::QC_INSB;
+
+  if (isInt<5>(Imm)) {
+    Opc = RISCV::QC_INSBI;
+    ImmNode = CurDAG->getSignedTargetConstant(Imm, DL, MVT::i32);
+  } else {
+    ImmNode = selectImm(CurDAG, DL, MVT::i32, Imm, *Subtarget);
+  }
+  SDValue Ops[] = {X, ImmNode, CurDAG->getTargetConstant(Width, DL, MVT::i32),
+                   CurDAG->getTargetConstant(ShAmt, DL, MVT::i32)};
+  ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, MVT::i32, Ops));
+
+  return true;
+}
+
 bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node,
                                                    const SDLoc &DL, MVT VT,
                                                    SDValue X, unsigned Msb,
@@ -1349,6 +1393,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     if (tryShrinkShlLogicImm(Node))
       return;
 
+    if (tryBitfieldInsertOpFromXor(Node))
+      return;
+
     break;
   case ISD::AND: {
     auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index ee3a86e..9d4cd0e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -75,6 +75,7 @@ public:
   bool trySignedBitfieldExtract(SDNode *Node);
   bool trySignedBitfieldInsertInSign(SDNode *Node);
   bool trySignedBitfieldInsertInMask(SDNode *Node);
+  bool tryBitfieldInsertOpFromXor(SDNode *Node);
   bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT,
                                   SDValue X, unsigned Msb, unsigned Lsb);
   bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8bc42ad..4f52f68 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14333,7 +14333,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0));
     MakeLibCallOptions CallOptions;
     EVT OpVT = Op0.getValueType();
-    CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+    CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0));
     SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
     SDValue Result;
     std::tie(Result, Chain) =
@@ -14368,7 +14368,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
         Op0.getValueType() == MVT::f64 ? RTLIB::LROUND_F64 : RTLIB::LROUND_F32;
     MakeLibCallOptions CallOptions;
     EVT OpVT = Op0.getValueType();
-    CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64, true);
+    CallOptions.setTypeListBeforeSoften(OpVT, MVT::i64);
     SDValue Result = makeLibCall(DAG, LC, MVT::i64, Op0, CallOptions, DL).first;
     Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Result);
     Results.push_back(Result);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 8bd3830..2a34a24 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1694,6 +1694,20 @@ multiclass SelectCC_GPR_riirr<DAGOperand valty, DAGOperand imm> {
                 valty:$truev, valty:$falsev), []>;
 }
 
+let Predicates = [IsRV32] in {
+def : Pat<(i32 (seteq (i32 (and GPR:$rs1, 0xffffffff80000000)), 0)),
+          (XORI (i32 (SRLI GPR:$rs1, 31)), 1)>;
+def : Pat<(i32 (setlt (i32 GPR:$rs1), 0)), (SRLI GPR:$rs1, 31)>; // compressible
+}
+let Predicates = [IsRV64] in {
+def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x8000000000000000)), 0)),
+          (XORI (i64 (SRLI GPR:$rs1, 63)), 1)>;
+def : Pat<(i64 (seteq (i64 (and GPR:$rs1, 0x0000000080000000)), 0)),
+          (XORI (i64 (SRLIW GPR:$rs1, 31)), 1)>;
+def : Pat<(i64 (setlt (i64 GPR:$rs1), 0)), (SRLI GPR:$rs1, 63)>; // compressible
+def : Pat<(i64 (setlt (sext_inreg GPR:$rs1, i32), 0)), (SRLIW GPR:$rs1, 31)>;
+}
+
 /// Branches and jumps
 
 // Match `riscv_brcc` and lower to the appropriate RISC-V branch instruction.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 8297d50..d17330f9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -98,6 +98,14 @@ class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType>
   let Inst{27}    = 0b0;
 }
 
+class RVPShiftD_ri<bits<3> f, bits<3> funct3, string opcodestr>
+    : RVPShift_ri<f, funct3, opcodestr, uimm6> {
+  bits<6> shamt;
+
+  let Inst{26} = 0b1;
+  let Inst{25-20} = shamt;
+}
+
 class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr>
     : RVPShift_ri<f, funct3, opcodestr, uimm5> {
   bits<5> shamt;
@@ -136,34 +144,36 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
 //===----------------------------------------------------------------------===//
 
 let Predicates = [HasStdExtP] in {
-let IsSignExtendingOpW = 1 in
-def CLS    : Unary_r<0b011000000011, 0b001, "cls">;
-def ABS    : Unary_r<0b011000000111, 0b001, "abs">;
+  let IsSignExtendingOpW = 1 in
+  def CLS    : Unary_r<0b011000000011, 0b001, "cls">;
+  def ABS    : Unary_r<0b011000000111, 0b001, "abs">;
 } // Predicates = [HasStdExtP]
-let Predicates = [HasStdExtP, IsRV32] in
-def REV_RV32  : Unary_r<0b011010011111, 0b101, "rev">;
+
+let Predicates = [HasStdExtP, IsRV32] in {
+  def REV_RV32  : Unary_r<0b011010011111, 0b101, "rev">;
+} // Predicates = [HasStdExtP, IsRV32]
 
 let Predicates = [HasStdExtP, IsRV64] in {
-def REV16      : Unary_r<0b011010110000, 0b101, "rev16">;
-def REV_RV64   : Unary_r<0b011010111111, 0b101, "rev">;
+  def REV16      : Unary_r<0b011010110000, 0b101, "rev16">;
+  def REV_RV64   : Unary_r<0b011010111111, 0b101, "rev">;
 
-let IsSignExtendingOpW = 1 in {
-def CLSW  : UnaryW_r<0b011000000011, 0b001, "clsw">;
-def ABSW  : UnaryW_r<0b011000000111, 0b001, "absw">;
-}
+  let IsSignExtendingOpW = 1 in {
+    def CLSW  : UnaryW_r<0b011000000011, 0b001, "clsw">;
+    def ABSW  : UnaryW_r<0b011000000111, 0b001, "absw">;
+  }
 } // Predicates = [HasStdExtP, IsRV64]
 
 let Predicates = [HasStdExtP] in {
-def PSLLI_B  : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
-def PSLLI_H  : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
-def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
+  def PSLLI_B  : RVPShiftB_ri<0b000, 0b010, "pslli.b">;
+  def PSLLI_H  : RVPShiftH_ri<0b000, 0b010, "pslli.h">;
+  def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">;
 } // Predicates = [HasStdExtP]
-let DecoderNamespace = "RV32Only",
-    Predicates = [HasStdExtP, IsRV32] in
-def SSLAI    : RVPShiftW_ri<0b101, 0b010, "sslai">;
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+  def SSLAI    : RVPShiftW_ri<0b101, 0b010, "sslai">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
 let Predicates = [HasStdExtP, IsRV64] in {
-def PSLLI_W  : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
-def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
+  def PSLLI_W  : RVPShiftW_ri<0b000, 0b010, "pslli.w">;
+  def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">;
 } // Predicates = [HasStdExtP, IsRV64]
 
 let Predicates = [HasStdExtP] in
@@ -174,16 +184,50 @@ let Predicates = [HasStdExtP] in
 def PLI_B : PLI_B_i<0b10110100, "pli.b">;
 
 let Predicates = [HasStdExtP] in {
-def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
-def PSABS_H   : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
-def PSABS_B   : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
+  def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">;
+  def PSABS_H   : RVPUnary_ri<0b00, 0b00111, "psabs.h">;
+  def PSABS_B   : RVPUnary_ri<0b10, 0b00111, "psabs.b">;
 } // Predicates = [HasStdExtP]
 let Predicates = [HasStdExtP, IsRV64] in {
-def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
-def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
+  def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">;
+  def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">;
 } // Predicates = [HasStdExtP, IsRV64]
 
 let Predicates = [HasStdExtP] in
 def PLUI_H : PLUI_i<0b1111000, "plui.h">;
 let Predicates = [HasStdExtP, IsRV64] in
 def PLUI_W : PLUI_i<0b1111001, "plui.w">;
+
+let Predicates = [HasStdExtP] in {
+  def PSRLI_B    : RVPShiftB_ri<0b000, 0b100, "psrli.b">;
+  def PSRLI_H    : RVPShiftH_ri<0b000, 0b100, "psrli.h">;
+
+  def PUSATI_H   : RVPShiftH_ri<0b010, 0b100, "pusati.h">;
+
+  def PSRAI_B    : RVPShiftB_ri<0b100, 0b100, "psrai.b">;
+  def PSRAI_H    : RVPShiftH_ri<0b100, 0b100, "psrai.h">;
+
+  def PSRARI_H   : RVPShiftH_ri<0b101, 0b100, "psrari.h">;
+
+  def PSATI_H    : RVPShiftH_ri<0b110, 0b100, "psati.h">;
+} // Predicates = [HasStdExtP]
+let Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only" in {
+  def USATI_RV32 : RVPShiftW_ri<0b010, 0b100, "usati">;
+
+  def SRARI_RV32 : RVPShiftW_ri<0b101, 0b100, "srari">;
+
+  def SATI_RV32  : RVPShiftW_ri<0b110, 0b100, "sati">;
+} // Predicates = [HasStdExtP, IsRV32], DecoderNamespace = "RV32Only"
+let Predicates = [HasStdExtP, IsRV64] in {
+  def PSRLI_W    : RVPShiftW_ri<0b000, 0b100, "psrli.w">;
+  def PSRAI_W    : RVPShiftW_ri<0b100, 0b100, "psrai.w">;
+
+  def PUSATI_W   : RVPShiftW_ri<0b010, 0b100, "pusati.w">;
+  def USATI_RV64 : RVPShiftD_ri<0b010, 0b100, "usati">;
+
+  def PSRARI_W   : RVPShiftW_ri<0b101, 0b100, "psrari.w">;
+  def SRARI_RV64 : RVPShiftD_ri<0b101, 0b100, "srari">;
+
+  def PSATI_W    : RVPShiftW_ri<0b110, 0b100, "psati.w">;
+  def SATI_RV64  : RVPShiftD_ri<0b110, 0b100, "sati">;
+} // Predicates = [HasStdExtP, IsRV64]
diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
index 5ef858a..8cf15fa 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td
@@ -24,7 +24,7 @@ let SchedModel = Andes45Model in {
 
 //===----------------------------------------------------------------------===//
 // Andes 45 series CPU
-//   - 2 Interger Arithmetic and Logical Units (ALU)
+//   - 2 Integer Arithmetic and Logical Units (ALU)
 //   - Multiply / Divide Unit (MDU)
 //   - Load Store Unit (LSU)
 //   - Control and Status Register Unit (CSR)
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 28c8f40..f013898 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -497,6 +497,10 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   case RISCV::VANDN_VX:
   // Vector Reverse Bits in Elements
   case RISCV::VBREV_V:
+  // Vector Reverse Bits in Bytes
+  case RISCV::VBREV8_V:
+  // Vector Reverse Bytes
+  case RISCV::VREV8_V:
   // Vector Count Leading Zeros
   case RISCV::VCLZ_V:
   // Vector Count Trailing Zeros
@@ -510,6 +514,13 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   case RISCV::VROR_VI:
   case RISCV::VROR_VV:
   case RISCV::VROR_VX:
+  // Vector Carry-less Multiplication Instructions (Zvbc)
+  // Vector Carry-less Multiply
+  case RISCV::VCLMUL_VV:
+  case RISCV::VCLMUL_VX:
+  // Vector Carry-less Multiply Return High Half
+  case RISCV::VCLMULH_VV:
+  case RISCV::VCLMULH_VX:
     return MILog2SEW;
 
   // Vector Widening Shift Left Logical (Zvbb)
@@ -1046,6 +1057,10 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VANDN_VX:
   // Vector Reverse Bits in Elements
   case RISCV::VBREV_V:
+  // Vector Reverse Bits in Bytes
+  case RISCV::VBREV8_V:
+  // Vector Reverse Bytes
+  case RISCV::VREV8_V:
   // Vector Count Leading Zeros
   case RISCV::VCLZ_V:
   // Vector Count Trailing Zeros
@@ -1063,6 +1078,13 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VWSLL_VI:
   case RISCV::VWSLL_VX:
   case RISCV::VWSLL_VV:
+  // Vector Carry-less Multiplication Instructions (Zvbc)
+  // Vector Carry-less Multiply
+  case RISCV::VCLMUL_VV:
+  case RISCV::VCLMUL_VX:
+  // Vector Carry-less Multiply Return High Half
+  case RISCV::VCLMULH_VV:
+  case RISCV::VCLMULH_VX:
   // Vector Mask Instructions
   // Vector Mask-Register Logical Instructions
   // vmsbf.m set-before-first mask bit
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f366094..97cdf5b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15419,18 +15419,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
         return SDValue();
     }
 
-    // Avoid returning the same shuffle operation. For example,
-    // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
-    //                             undef:v16i16
-    if (CrossLaneMask == Mask || InLaneMask == Mask)
-      return SDValue();
-
     // Simplify CrossLaneMask based on the actual demanded elements.
     if (V1.hasOneUse())
       for (int i = 0; i != NumElts; ++i)
         if (!DemandedCrossLane[i])
           CrossLaneMask[i] = SM_SentinelUndef;
 
+    // Avoid returning the same shuffle operation. For example,
+    // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
+    //                             undef:v16i16
+    if (CrossLaneMask == Mask || InLaneMask == Mask)
+      return SDValue();
+
     SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
     return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
                                 InLaneMask);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 9ef21fa..cae6bb9 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5488,9 +5488,10 @@ InstructionCost X86TTIImpl::getPointersChainCost(
   return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
 }
 
-InstructionCost X86TTIImpl::getAddressComputationCost(Type *PtrTy,
-                                                      ScalarEvolution *SE,
-                                                      const SCEV *Ptr) const {
+InstructionCost
+X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+                                      const SCEV *Ptr,
+                                      TTI::TargetCostKind CostKind) const {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
   // computation can more often be merged into the index mode. The resulting
@@ -5513,7 +5514,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *PtrTy,
       return 1;
   }
 
-  return BaseT::getAddressComputationCost(PtrTy, SE, Ptr);
+  return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index bc06c47..5718c0c 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -194,8 +194,9 @@ public:
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
                        const TTI::PointersChainInfo &Info, Type *AccessTy,
                        TTI::TargetCostKind CostKind) const override;
-  InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
-                                            const SCEV *Ptr) const override;
+  InstructionCost
+  getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
+                            TTI::TargetCostKind CostKind) const override;
 
   std::optional<Instruction *>
   instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index ab906f9..180ac9c 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -2252,6 +2252,10 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C,
       UR.CWorklist.insert(CurrentSCC);
       for (Function *Clone : Clones)
         UR.CWorklist.insert(CG.lookupSCC(CG.get(*Clone)));
+    } else if (Shape.ABI == coro::ABI::Async) {
+      // Reprocess the function to inline the tail called return function of
+      // coro.async.end.
+      UR.CWorklist.insert(&C);
     }
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index cf94d28..a64f422 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1320,6 +1320,35 @@ Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) {
   return nullptr;
 }
 
+/// Fold icmp eq/ne ((num + mask) & ~mask), num
+///      to
+///      icmp eq/ne (and num, mask), 0
+/// where mask is a low bit mask (both equality predicates are handled).
+Instruction *InstCombinerImpl::foldIsMultipleOfAPowerOfTwo(ICmpInst &Cmp) {
+  Value *Num;
+  CmpPredicate Pred;
+  const APInt *Mask, *Neg;
+
+  if (!match(&Cmp,
+             m_c_ICmp(Pred, m_Value(Num),
+                      m_OneUse(m_c_And(m_OneUse(m_c_Add(m_Deferred(Num),
+                                                        m_LowBitMask(Mask))),
+                                       m_APInt(Neg))))))
+    return nullptr;
+
+  if (*Neg != ~*Mask)
+    return nullptr;
+
+  if (!ICmpInst::isEquality(Pred))
+    return nullptr;
+
+  // Create new icmp eq/ne (num & mask), 0
+  auto *NewAnd = Builder.CreateAnd(Num, *Mask);
+  auto *Zero = Constant::getNullValue(Num->getType());
+
+  return new ICmpInst(Pred, NewAnd, Zero);
+}
+
 /// Fold icmp Pred X, C.
 /// TODO: This code structure does not make sense. The saturating add fold
 /// should be moved to some other helper and extended as noted below (it is also
@@ -7644,6 +7673,9 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
   if (Instruction *Res = foldICmpUsingKnownBits(I))
     return Res;
 
+  if (Instruction *Res = foldIsMultipleOfAPowerOfTwo(I))
+    return Res;
+
   // Test if the ICmpInst instruction is used exclusively by a select as
   // part of a minimum or maximum operation. If so, refrain from doing
   // any other folding. This helps out other analyses which understand
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index c67e27e..2340028 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -721,6 +721,7 @@ public:
   Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
   Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp);
   Instruction *foldICmpWithConstant(ICmpInst &Cmp);
+  Instruction *foldIsMultipleOfAPowerOfTwo(ICmpInst &Cmp);
   Instruction *foldICmpUsingBoolRange(ICmpInst &I);
   Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
   Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 1ef4fcc..4c035a2 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2867,7 +2867,7 @@ static bool hoistBOAssociation(Instruction &I, Loop &L,
   bool LVInRHS = L.isLoopInvariant(BO->getOperand(0));
   auto *BO0 = dyn_cast<BinaryOperator>(BO->getOperand(LVInRHS));
   if (!BO0 || BO0->getOpcode() != Opcode || !BO0->isAssociative() ||
-      BO0->hasNUsesOrMore(3))
+      BO0->hasNUsesOrMore(BO0->getType()->isIntegerTy() ? 2 : 3))
     return false;
 
   Value *LV = BO0->getOperand(0);
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 844219a..8b15445 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -2309,7 +2309,9 @@ chainToBasePointerCost(SmallVectorImpl<Instruction *> &Chain,
 
     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
       // Cost of the address calculation
-      Cost += TTI.getAddressComputationCost(GEP->getType());
+      Cost += TTI.getAddressComputationCost(
+          GEP->getType(), nullptr, nullptr,
+          TargetTransformInfo::TCK_SizeAndLatency);
 
       // And cost of the GEP itself
       // TODO: Use TTI->getGEPCost here (it exists, but appears to be not
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6ffe841..fc96589 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -294,6 +294,10 @@ private:
   bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
                     bool NonNegative);
 
+  /// Analyze XOR instruction to extract disjoint constant bits that behave
+  /// like addition operations for improved address mode folding.
+  APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
+
   /// The path from the constant offset to the old GEP index. e.g., if the GEP
   /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
   /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
@@ -596,6 +600,9 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
     // Trace into subexpressions for more hoisting opportunities.
     if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
       ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+    // Handle XOR with disjoint bits that can be treated as addition.
+    else if (BO->getOpcode() == Instruction::Xor)
+      ConstantOffset = extractDisjointBitsFromXor(BO);
   } else if (isa<TruncInst>(V)) {
     ConstantOffset =
         find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -708,11 +715,20 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   Value *NextInChain = removeConstOffset(ChainIndex - 1);
   Value *TheOther = BO->getOperand(1 - OpNo);
 
-  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
-  // sub-expression to be just TheOther.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
-    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
-      return TheOther;
+    if (CI->isZero()) {
+      // Custom XOR handling for disjoint bits - preserves original XOR
+      // with non-disjoint constant bits.
+      // TODO: The design should be updated to support partial constant
+      // extraction.
+      if (BO->getOpcode() == Instruction::Xor)
+        return BO;
+
+      // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+      // sub-expression to be just TheOther.
+      if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+        return TheOther;
+    }
   }
 
   BinaryOperator::BinaryOps NewOp = BO->getOpcode();
@@ -743,6 +759,67 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   return NewBO;
 }
 
+/// Analyze an XOR instruction to extract disjoint constant bits for address
+/// folding.
+///
+/// This function identifies bits in an XOR constant operand that are known to
+/// be zero in the base operand. For these disjoint bits, XOR behaves
+/// identically to addition, allowing us to extract them as constant offsets
+/// that can be folded into addressing modes.
+///
+/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) +
+/// DisjointBits` where DisjointBits = Const & KnownZeros(Base)
+///
+/// Example with ptr having known-zero low bit:
+///   Original: `xor %ptr, 3`    ; 3 = 0b11
+///   Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01
+///   Result:   `(xor %ptr, 2) + 1` where 1 can be folded into address mode
+///
+/// \param XorInst The XOR binary operator to analyze
+/// \return APInt containing the disjoint bits that can be extracted as offset,
+///         or zero if no disjoint bits exist
+APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
+    BinaryOperator *XorInst) {
+  assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
+         "Expected XOR instruction");
+
+  const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits();
+  Value *BaseOperand;
+  ConstantInt *XorConstant;
+
+  // Match pattern: xor BaseOperand, Constant.
+  if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
+    return APInt::getZero(BitWidth);
+
+  // Compute known bits for the base operand.
+  const SimplifyQuery SQ(DL);
+  const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
+  const APInt &ConstantValue = XorConstant->getValue();
+
+  // Identify disjoint bits: constant bits that are known zero in base.
+  const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero;
+
+  // Early exit if no disjoint bits found.
+  if (DisjointBits.isZero())
+    return APInt::getZero(BitWidth);
+
+  // Compute the remaining non-disjoint bits that stay in the XOR.
+  const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
+
+  // FIXME: Enhance XOR constant extraction to handle nested binary operations.
+  // Currently we only extract disjoint bits from the immediate XOR constant,
+  // but we could recursively process cases like:
+  //   xor (add %base, C1), C2  ->  add %base, (C1 ^ disjoint_bits(C2))
+  // This requires careful analysis to ensure the transformation preserves
+  // semantics, particularly around sign extension and overflow behavior.
+
+  // Add the non-disjoint constant to the user chain for later transformation.
+  // This will replace the original constant in the XOR with the new
+  // constant.
+  UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
+  return DisjointBits;
+}
+
 /// A helper function to check if reassociating through an entry in the user
 /// chain would invalidate the GEP's nuw flag.
 static bool allowsPreservingNUW(const User *U) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a7720b1..cb37ec3e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -499,18 +499,16 @@ class InnerLoopVectorizer {
 public:
   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                       LoopInfo *LI, DominatorTree *DT,
-                      const TargetLibraryInfo *TLI,
                       const TargetTransformInfo *TTI, AssumptionCache *AC,
-                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
+                      ElementCount VecWidth,
                       ElementCount MinProfitableTripCount,
                       unsigned UnrollFactor, LoopVectorizationCostModel *CM,
                       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
                       GeneratedRTChecks &RTChecks, VPlan &Plan)
-      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
-        AC(AC), ORE(ORE), VF(VecWidth),
-        MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
-        Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
-        RTChecks(RTChecks), Plan(Plan),
+      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
+        VF(VecWidth), MinProfitableTripCount(MinProfitableTripCount),
+        UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Cost(CM),
+        BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
         VectorPHVPBB(cast<VPBasicBlock>(
             Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
 
@@ -584,18 +582,12 @@ protected:
   /// Dominator Tree.
   DominatorTree *DT;
 
-  /// Target Library Info.
-  const TargetLibraryInfo *TLI;
-
   /// Target Transform Info.
   const TargetTransformInfo *TTI;
 
   /// Assumption Cache.
   AssumptionCache *AC;
 
-  /// Interface to emit optimization remarks.
-  OptimizationRemarkEmitter *ORE;
-
   /// The vectorization SIMD factor to use. Each vector will have this many
   /// vector elements.
   ElementCount VF;
@@ -617,9 +609,6 @@ protected:
   /// The scalar-loop preheader.
   BasicBlock *LoopScalarPreHeader = nullptr;
 
-  /// Middle Block between the vector and the scalar.
-  BasicBlock *LoopMiddleBlock = nullptr;
-
   /// Trip count of the original loop.
   Value *TripCount = nullptr;
 
@@ -684,14 +673,12 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
 public:
   InnerLoopAndEpilogueVectorizer(
       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
-      DominatorTree *DT, const TargetLibraryInfo *TLI,
-      const TargetTransformInfo *TTI, AssumptionCache *AC,
-      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
-      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
-      ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan,
-      ElementCount VecWidth, ElementCount MinProfitableTripCount,
-      unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, VecWidth,
+      DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
+      EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
+      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+      GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
+      ElementCount MinProfitableTripCount, unsigned UnrollFactor)
+      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
                             MinProfitableTripCount, UnrollFactor, CM, BFI, PSI,
                             Checks, Plan),
         EPI(EPI) {}
@@ -721,16 +708,17 @@ public:
 /// epilogues.
 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
 public:
-  EpilogueVectorizerMainLoop(
-      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
-      DominatorTree *DT, const TargetLibraryInfo *TLI,
-      const TargetTransformInfo *TTI, AssumptionCache *AC,
-      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
-      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
-      ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
-      : InnerLoopAndEpilogueVectorizer(
-            OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, CM, BFI, PSI, Check,
-            Plan, EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF) {}
+  EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+                             LoopInfo *LI, DominatorTree *DT,
+                             const TargetTransformInfo *TTI,
+                             AssumptionCache *AC,
+                             EpilogueLoopVectorizationInfo &EPI,
+                             LoopVectorizationCostModel *CM,
+                             BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+                             GeneratedRTChecks &Check, VPlan &Plan)
+      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
+                                       BFI, PSI, Check, Plan, EPI.MainLoopVF,
+                                       EPI.MainLoopVF, EPI.MainLoopUF) {}
   /// Implements the interface for creating a vectorized skeleton using the
   /// *main loop* strategy (ie the first pass of vplan execution).
   BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
@@ -751,14 +739,13 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
 public:
   EpilogueVectorizerEpilogueLoop(
       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
-      DominatorTree *DT, const TargetLibraryInfo *TLI,
-      const TargetTransformInfo *TTI, AssumptionCache *AC,
-      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
-      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
-      ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
-      : InnerLoopAndEpilogueVectorizer(
-            OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, CM, BFI, PSI, Checks,
-            Plan, EPI.EpilogueVF, EPI.EpilogueVF, EPI.EpilogueUF) {
+      DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
+      EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
+      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+      GeneratedRTChecks &Checks, VPlan &Plan)
+      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
+                                       BFI, PSI, Checks, Plan, EPI.EpilogueVF,
+                                       EPI.EpilogueVF, EPI.EpilogueUF) {
     TripCount = EPI.TripCount;
   }
   /// Implements the interface for creating a vectorized skeleton using the
@@ -5227,8 +5214,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
 
   // Get the cost of the scalar memory instruction and address computation.
-  InstructionCost Cost =
-      VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+  InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
+                                                  PtrTy, SE, PtrSCEV, CostKind);
 
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
@@ -5304,7 +5291,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
   if (isa<LoadInst>(I)) {
-    return TTI.getAddressComputationCost(PtrTy) +
+    return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                                CostKind) +
            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy,
@@ -5317,7 +5304,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
   // the actual generated code, which involves extracting the last element of
   // a scalable vector where the lane to extract is unknown at compile time.
-  return TTI.getAddressComputationCost(PtrTy) +
+  return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                              CostKind) +
          (IsLoopInvariantStoreValue
@@ -5335,7 +5322,7 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
   const Value *Ptr = getLoadStorePointerOperand(I);
   Type *PtrTy = toVectorTy(Ptr->getType(), VF);
 
-  return TTI.getAddressComputationCost(PtrTy) +
+  return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                     Legal->isMaskRequired(I), Alignment,
                                     CostKind, I);
@@ -5575,7 +5562,7 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
     unsigned AS = getLoadStoreAddressSpace(I);
 
     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
-    return TTI.getAddressComputationCost(PtrTy) +
+    return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
                                OpInfo, I);
   }
@@ -7294,15 +7281,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
 
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
-  VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
+  VPlanTransforms::simplifyRecipes(BestVPlan);
   VPlanTransforms::removeBranchOnConst(BestVPlan);
   VPlanTransforms::narrowInterleaveGroups(
       BestVPlan, BestVF,
       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
   VPlanTransforms::removeDeadRecipes(BestVPlan);
 
-  VPlanTransforms::convertToConcreteRecipes(BestVPlan,
-                                            *Legal->getWidestInductionType());
+  VPlanTransforms::convertToConcreteRecipes(BestVPlan);
   // Regions are dissolved after optimizing for VF and UF, which completely
   // removes unneeded loop regions first.
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
@@ -9476,8 +9462,8 @@ static bool processLoopInVPlanNativePath(
 
   {
     GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
-    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
-                           VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
+    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, VF.Width, 1, &CM,
+                           BFI, PSI, Checks, BestPlan);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                       << L->getHeader()->getParent()->getName() << "\"\n");
     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10259,7 +10245,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       // interleave it.
       VPlan &BestPlan = LVP.getPlanFor(VF.Width);
       InnerLoopVectorizer Unroller(
-          L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
+          L, PSE, LI, DT, TTI, AC, ElementCount::getFixed(1),
           ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
 
       // TODO: Move to general VPlan pipeline once epilogue loops are also
@@ -10294,18 +10280,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
                                           BestEpiPlan);
-        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
-                                           EPI, &CM, BFI, PSI, Checks,
-                                           *BestMainPlan);
+        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
+                                           BFI, PSI, Checks, *BestMainPlan);
         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
                                              *BestMainPlan, MainILV, DT, false);
         ++LoopsVectorized;
 
         // Second pass vectorizes the epilogue and adjusts the control flow
         // edges from the first pass.
-        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
-                                                 ORE, EPI, &CM, BFI, PSI,
-                                                 Checks, BestEpiPlan);
+        EpilogueVectorizerEpilogueLoop EpilogILV(
+            L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan);
         EpilogILV.setTripCount(MainILV.getTripCount());
         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
 
@@ -10330,7 +10314,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         if (!Checks.hasChecks())
           DisableRuntimeUnroll = true;
       } else {
-        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+        InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width,
                                VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
                                Checks, BestPlan);
         // TODO: Move to general VPlan pipeline once epilogue loops are also
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9801117..b9b3314 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5689,7 +5689,8 @@ private:
     /// Updates the dependency information of a bundle and of all instructions/
     /// bundles which depend on the original bundle.
     void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
-                               BoUpSLP *SLP);
+                               BoUpSLP *SLP,
+                               ArrayRef<ScheduleData *> ControlDeps = {});
 
     /// Sets all instruction in the scheduling region to un-scheduled.
     void resetSchedule();
@@ -20727,15 +20728,21 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");
 
   auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
-    // Clear deps or reculate the region, if the memory instruction is a
-    // copyable. It may have memory deps, which must be reaculated.
+    // Clear deps or recalculate the region, if the memory instruction is a
+    // copyable. It may have memory deps, which must be recalculated.
+    SmallVector<ScheduleData *> ControlDependentMembers;
     auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
       SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
       for (ScheduleEntity *SE : Bundle.getBundle()) {
         if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
           if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
-              BundleMember && BundleMember->hasValidDependencies())
+              BundleMember && BundleMember->hasValidDependencies()) {
             BundleMember->clearDirectDependencies();
+            if (RegionHasStackSave ||
+                !isGuaranteedToTransferExecutionToSuccessor(
+                    BundleMember->getInst()))
+              ControlDependentMembers.push_back(BundleMember);
+          }
           continue;
         }
         auto *SD = cast<ScheduleData>(SE);
@@ -20748,8 +20755,12 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
           if (auto *Op = dyn_cast<Instruction>(U.get());
               Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                          *SLP, NumOps)) {
-            if (ScheduleData *OpSD = getScheduleData(Op))
+            if (ScheduleData *OpSD = getScheduleData(Op)) {
               OpSD->clearDirectDependencies();
+              if (RegionHasStackSave ||
+                  !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
+                ControlDependentMembers.push_back(OpSD);
+            }
           }
         }
       }
@@ -20783,7 +20794,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
         CheckIfNeedToClearDeps(Bundle);
       LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                         << BB->getName() << "\n");
-      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
+      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
+                            ControlDependentMembers);
     }
 
     if (ReSchedule) {
@@ -21048,9 +21060,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
   }
 }
 
-void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
-                                                     bool InsertInReadyList,
-                                                     BoUpSLP *SLP) {
+void BoUpSLP::BlockScheduling::calculateDependencies(
+    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
+    ArrayRef<ScheduleData *> ControlDeps) {
   SmallVector<ScheduleEntity *> WorkList;
   auto ProcessNode = [&](ScheduleEntity *SE) {
     if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
@@ -21293,6 +21305,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
   };
 
   WorkList.push_back(Bundle.getBundle().front());
+  WorkList.append(ControlDeps.begin(), ControlDeps.end());
   SmallPtrSet<ScheduleBundle *, 16> Visited;
   while (!WorkList.empty()) {
     ScheduleEntity *SD = WorkList.pop_back_val();
@@ -21362,7 +21375,7 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
   });
   // Reset schedule data for copyable elements.
   for (auto &P : ScheduleCopyableDataMap) {
-    if (isInSchedulingRegion(*P.second.get())) {
+    if (isInSchedulingRegion(*P.second)) {
       P.second->setScheduled(/*Scheduled=*/false);
       P.second->resetUnscheduledDeps();
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 8818843..9f036fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -200,15 +200,11 @@ template <typename Ops_t, unsigned Opcode, bool Commutative,
 struct Recipe_match {
   Ops_t Ops;
 
-  Recipe_match() : Ops() {
-    static_assert(std::tuple_size<Ops_t>::value == 0 &&
-                  "constructor can only be used with zero operands");
-  }
-  Recipe_match(Ops_t Ops) : Ops(Ops) {}
-  template <typename A_t, typename B_t>
-  Recipe_match(A_t A, B_t B) : Ops({A, B}) {
-    static_assert(std::tuple_size<Ops_t>::value == 2 &&
-                  "constructor can only be used for binary matcher");
+  template <typename... OpTy> Recipe_match(OpTy... Ops) : Ops(Ops...) {
+    static_assert(std::tuple_size<Ops_t>::value == sizeof...(Ops) &&
+                  "number of operands in constructor doesn't match Ops_t");
+    static_assert((!Commutative || std::tuple_size<Ops_t>::value == 2) &&
+                  "only binary ops can be commutative");
   }
 
   bool match(const VPValue *V) const {
@@ -254,7 +250,6 @@ private:
     // Check for recipes that do not have opcodes.
     if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
                   std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
-                  std::is_same<RecipeTy, VPWidenSelectRecipe>::value ||
                   std::is_same<RecipeTy, VPDerivedIVRecipe>::value ||
                   std::is_same<RecipeTy, VPWidenGEPRecipe>::value)
       return DefR;
@@ -270,195 +265,128 @@ private:
   }
 };
 
-template <unsigned Opcode, typename... RecipeTys>
-using ZeroOpRecipe_match =
-    Recipe_match<std::tuple<>, Opcode, false, RecipeTys...>;
-
-template <typename Op0_t, unsigned Opcode, typename... RecipeTys>
-using UnaryRecipe_match =
-    Recipe_match<std::tuple<Op0_t>, Opcode, false, RecipeTys...>;
-
-template <typename Op0_t, unsigned Opcode>
-using UnaryVPInstruction_match =
-    UnaryRecipe_match<Op0_t, Opcode, VPInstruction>;
+template <unsigned Opcode, typename... OpTys>
+using AllRecipe_match =
+    Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ false,
+                 VPWidenRecipe, VPReplicateRecipe, VPWidenCastRecipe,
+                 VPInstruction, VPWidenSelectRecipe>;
 
-template <unsigned Opcode>
-using ZeroOpVPInstruction_match = ZeroOpRecipe_match<Opcode, VPInstruction>;
+template <unsigned Opcode, typename... OpTys>
+using AllRecipe_commutative_match =
+    Recipe_match<std::tuple<OpTys...>, Opcode, /*Commutative*/ true,
+                 VPWidenRecipe, VPReplicateRecipe, VPInstruction>;
 
-template <typename Op0_t, unsigned Opcode>
-using AllUnaryRecipe_match =
-    UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
-                      VPWidenCastRecipe, VPInstruction>;
+template <unsigned Opcode, typename... OpTys>
+using VPInstruction_match = Recipe_match<std::tuple<OpTys...>, Opcode,
+                                         /*Commutative*/ false, VPInstruction>;
 
-template <typename Op0_t, typename Op1_t, unsigned Opcode, bool Commutative,
-          typename... RecipeTys>
-using BinaryRecipe_match =
-    Recipe_match<std::tuple<Op0_t, Op1_t>, Opcode, Commutative, RecipeTys...>;
-
-template <typename Op0_t, typename Op1_t, unsigned Opcode>
-using BinaryVPInstruction_match =
-    BinaryRecipe_match<Op0_t, Op1_t, Opcode, /*Commutative*/ false,
-                       VPInstruction>;
-
-template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode,
-          bool Commutative, typename... RecipeTys>
-using TernaryRecipe_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>,
-                                         Opcode, Commutative, RecipeTys...>;
-
-template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode>
-using TernaryVPInstruction_match =
-    TernaryRecipe_match<Op0_t, Op1_t, Op2_t, Opcode, /*Commutative*/ false,
-                        VPInstruction>;
-
-template <typename Op0_t, typename Op1_t, unsigned Opcode,
-          bool Commutative = false>
-using AllBinaryRecipe_match =
-    BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe,
-                       VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>;
+template <unsigned Opcode, typename... OpTys>
+inline VPInstruction_match<Opcode, OpTys...>
+m_VPInstruction(const OpTys &...Ops) {
+  return VPInstruction_match<Opcode, OpTys...>(Ops...);
+}
 
 /// BuildVector is matches only its opcode, w/o matching its operands as the
 /// number of operands is not fixed.
-inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() {
-  return ZeroOpVPInstruction_match<VPInstruction::BuildVector>();
-}
-
-template <unsigned Opcode, typename Op0_t>
-inline UnaryVPInstruction_match<Op0_t, Opcode>
-m_VPInstruction(const Op0_t &Op0) {
-  return UnaryVPInstruction_match<Op0_t, Opcode>(Op0);
-}
-
-template <unsigned Opcode, typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, Opcode>
-m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1) {
-  return BinaryVPInstruction_match<Op0_t, Op1_t, Opcode>(Op0, Op1);
+inline VPInstruction_match<VPInstruction::BuildVector> m_BuildVector() {
+  return m_VPInstruction<VPInstruction::BuildVector>();
 }
 
-template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t>
-inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>
-m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
-  return TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>(
-      {Op0, Op1, Op2});
-}
-
-template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t,
-          unsigned Opcode, bool Commutative, typename... RecipeTys>
-using Recipe4Op_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t, Op3_t>,
-                                     Opcode, Commutative, RecipeTys...>;
-
-template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t,
-          unsigned Opcode>
-using VPInstruction4Op_match =
-    Recipe4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode, /*Commutative*/ false,
-                    VPInstruction>;
-
-template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t,
-          typename Op3_t>
-inline VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>
-m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2,
-                const Op3_t &Op3) {
-  return VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>(
-      {Op0, Op1, Op2, Op3});
-}
 template <typename Op0_t>
-inline UnaryVPInstruction_match<Op0_t, Instruction::Freeze>
+inline VPInstruction_match<Instruction::Freeze, Op0_t>
 m_Freeze(const Op0_t &Op0) {
   return m_VPInstruction<Instruction::Freeze>(Op0);
 }
 
 template <typename Op0_t>
-inline UnaryVPInstruction_match<Op0_t, VPInstruction::BranchOnCond>
+inline VPInstruction_match<VPInstruction::BranchOnCond, Op0_t>
 m_BranchOnCond(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::BranchOnCond>(Op0);
 }
 
 template <typename Op0_t>
-inline UnaryVPInstruction_match<Op0_t, VPInstruction::Broadcast>
+inline VPInstruction_match<VPInstruction::Broadcast, Op0_t>
 m_Broadcast(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::Broadcast>(Op0);
 }
 
 template <typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
+inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t>
 m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
   return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
-inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount>
+inline VPInstruction_match<VPInstruction::BranchOnCount, Op0_t, Op1_t>
 m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
   return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
 }
 
 template <unsigned Opcode, typename Op0_t>
-inline AllUnaryRecipe_match<Op0_t, Opcode> m_Unary(const Op0_t &Op0) {
-  return AllUnaryRecipe_match<Op0_t, Opcode>(Op0);
+inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) {
+  return AllRecipe_match<Opcode, Op0_t>(Op0);
 }
 
 template <typename Op0_t>
-inline AllUnaryRecipe_match<Op0_t, Instruction::Trunc>
-m_Trunc(const Op0_t &Op0) {
+inline AllRecipe_match<Instruction::Trunc, Op0_t> m_Trunc(const Op0_t &Op0) {
   return m_Unary<Instruction::Trunc, Op0_t>(Op0);
 }
 
 template <typename Op0_t>
-inline AllUnaryRecipe_match<Op0_t, Instruction::ZExt> m_ZExt(const Op0_t &Op0) {
+inline AllRecipe_match<Instruction::ZExt, Op0_t> m_ZExt(const Op0_t &Op0) {
   return m_Unary<Instruction::ZExt, Op0_t>(Op0);
 }
 
 template <typename Op0_t>
-inline AllUnaryRecipe_match<Op0_t, Instruction::SExt> m_SExt(const Op0_t &Op0) {
+inline AllRecipe_match<Instruction::SExt, Op0_t> m_SExt(const Op0_t &Op0) {
   return m_Unary<Instruction::SExt, Op0_t>(Op0);
 }
 
 template <typename Op0_t>
-inline match_combine_or<AllUnaryRecipe_match<Op0_t, Instruction::ZExt>,
-                        AllUnaryRecipe_match<Op0_t, Instruction::SExt>>
+inline match_combine_or<AllRecipe_match<Instruction::ZExt, Op0_t>,
+                        AllRecipe_match<Instruction::SExt, Op0_t>>
 m_ZExtOrSExt(const Op0_t &Op0) {
   return m_CombineOr(m_ZExt(Op0), m_SExt(Op0));
 }
 
-template <unsigned Opcode, typename Op0_t, typename Op1_t,
-          bool Commutative = false>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>
-m_Binary(const Op0_t &Op0, const Op1_t &Op1) {
-  return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>(Op0, Op1);
+template <unsigned Opcode, typename Op0_t, typename Op1_t>
+inline AllRecipe_match<Opcode, Op0_t, Op1_t> m_Binary(const Op0_t &Op0,
+                                                      const Op1_t &Op1) {
+  return AllRecipe_match<Opcode, Op0_t, Op1_t>(Op0, Op1);
 }
 
 template <unsigned Opcode, typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>
+inline AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>
 m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) {
-  return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, true>(Op0, Op1);
+  return AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>(Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul>
-m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
+inline AllRecipe_match<Instruction::Mul, Op0_t, Op1_t> m_Mul(const Op0_t &Op0,
+                                                             const Op1_t &Op1) {
   return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul,
-                             /* Commutative =*/true>
+inline AllRecipe_commutative_match<Instruction::Mul, Op0_t, Op1_t>
 m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_Binary<Instruction::Mul, Op0_t, Op1_t, true>(Op0, Op1);
+  return m_c_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
 }
 
 /// Match a binary OR operation. Note that while conceptually the operands can
 /// be matched commutatively, \p Commutative defaults to false in line with the
 /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative
 /// version of the matcher.
-template <typename Op0_t, typename Op1_t, bool Commutative = false>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or, Commutative>
+template <typename Op0_t, typename Op1_t>
+inline AllRecipe_match<Instruction::Or, Op0_t, Op1_t>
 m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_Binary<Instruction::Or, Op0_t, Op1_t, Commutative>(Op0, Op1);
+  return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or,
-                             /*Commutative*/ true>
+inline AllRecipe_commutative_match<Instruction::Or, Op0_t, Op1_t>
 m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1);
+  return m_c_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
 }
 
 /// ICmp_match is a variant of BinaryRecipe_match that also binds the comparison
@@ -523,9 +451,9 @@ m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
 
 template <typename Op0_t, typename Op1_t>
 using GEPLikeRecipe_match =
-    BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false,
-                       VPWidenRecipe, VPReplicateRecipe, VPWidenGEPRecipe,
-                       VPInstruction>;
+    Recipe_match<std::tuple<Op0_t, Op1_t>, Instruction::GetElementPtr,
+                 /*Commutative*/ false, VPWidenRecipe, VPReplicateRecipe,
+                 VPWidenGEPRecipe, VPInstruction>;
 
 template <typename Op0_t, typename Op1_t>
 inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0,
@@ -533,22 +461,17 @@ inline GEPLikeRecipe_match<Op0_t, Op1_t> m_GetElementPtr(const Op0_t &Op0,
   return GEPLikeRecipe_match<Op0_t, Op1_t>(Op0, Op1);
 }
 
-template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode>
-using AllTernaryRecipe_match =
-    Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, Opcode, false,
-                 VPReplicateRecipe, VPInstruction, VPWidenSelectRecipe>;
-
 template <typename Op0_t, typename Op1_t, typename Op2_t>
-inline AllTernaryRecipe_match<Op0_t, Op1_t, Op2_t, Instruction::Select>
+inline AllRecipe_match<Instruction::Select, Op0_t, Op1_t, Op2_t>
 m_Select(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
-  return AllTernaryRecipe_match<Op0_t, Op1_t, Op2_t, Instruction::Select>(
+  return AllRecipe_match<Instruction::Select, Op0_t, Op1_t, Op2_t>(
       {Op0, Op1, Op2});
 }
 
 template <typename Op0_t>
-inline match_combine_or<UnaryVPInstruction_match<Op0_t, VPInstruction::Not>,
-                        AllBinaryRecipe_match<int_pred_ty<is_all_ones>, Op0_t,
-                                              Instruction::Xor, true>>
+inline match_combine_or<VPInstruction_match<VPInstruction::Not, Op0_t>,
+                        AllRecipe_commutative_match<
+                            Instruction::Xor, int_pred_ty<is_all_ones>, Op0_t>>
 m_Not(const Op0_t &Op0) {
   return m_CombineOr(m_VPInstruction<VPInstruction::Not>(Op0),
                      m_c_Binary<Instruction::Xor>(m_AllOnes(), Op0));
@@ -556,9 +479,8 @@ m_Not(const Op0_t &Op0) {
 
 template <typename Op0_t, typename Op1_t>
 inline match_combine_or<
-    BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd>,
-    AllTernaryRecipe_match<Op0_t, Op1_t, specific_intval<1>,
-                           Instruction::Select>>
+    VPInstruction_match<VPInstruction::LogicalAnd, Op0_t, Op1_t>,
+    AllRecipe_match<Instruction::Select, Op0_t, Op1_t, specific_intval<1>>>
 m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
   return m_CombineOr(
       m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1),
@@ -566,15 +488,14 @@ m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
 }
 
 template <typename Op0_t, typename Op1_t>
-inline AllTernaryRecipe_match<Op0_t, specific_intval<1>, Op1_t,
-                              Instruction::Select>
+inline AllRecipe_match<Instruction::Select, Op0_t, specific_intval<1>, Op1_t>
 m_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_Select(Op0, m_True(), Op1);
 }
 
 template <typename Op0_t, typename Op1_t, typename Op2_t>
-using VPScalarIVSteps_match =
-    TernaryRecipe_match<Op0_t, Op1_t, Op2_t, 0, false, VPScalarIVStepsRecipe>;
+using VPScalarIVSteps_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>, 0,
+                                           false, VPScalarIVStepsRecipe>;
 
 template <typename Op0_t, typename Op1_t, typename Op2_t>
 inline VPScalarIVSteps_match<Op0_t, Op1_t, Op2_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 23c10d2b..7bbd0dc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3130,7 +3130,8 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     Type *PtrTy = toVectorTy(Ptr->getType(), VF);
     assert(!Reverse &&
            "Inconsecutive memory access should not have the order.");
-    return Ctx.TTI.getAddressComputationCost(PtrTy) +
+    return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
+                                             Ctx.CostKind) +
            Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
                                           Ctx.CostKind, &Ingredient);
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f75b2f2..c999ef2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1231,7 +1231,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   }
 }
 
-void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
+void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       Plan.getEntry());
   VPTypeAnalysis TypeInfo(Plan);
@@ -1498,7 +1498,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   // the region, otherwise replace the terminator controlling the latch with
   // (BranchOnCond true).
   auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
-  auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
   if (all_of(Header->phis(),
              IsaPred<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
                      VPFirstOrderRecurrencePHIRecipe, VPPhi>)) {
@@ -1518,7 +1517,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
 
     VPBlockUtils::connectBlocks(Preheader, Header);
     VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
-    VPlanTransforms::simplifyRecipes(Plan, *CanIVTy);
+    VPlanTransforms::simplifyRecipes(Plan);
   } else {
     // The vector region contains header phis for which we cannot remove the
     // loop region yet.
@@ -1932,13 +1931,13 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(removeRedundantCanonicalIVs, Plan);
   runPass(removeRedundantInductionCasts, Plan);
 
-  runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
+  runPass(simplifyRecipes, Plan);
   runPass(simplifyBlends, Plan);
   runPass(removeDeadRecipes, Plan);
   runPass(narrowToSingleScalarRecipes, Plan);
   runPass(legalizeAndOptimizeInductions, Plan);
   runPass(removeRedundantExpandSCEVRecipes, Plan);
-  runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
+  runPass(simplifyRecipes, Plan);
   runPass(removeBranchOnConst, Plan);
   runPass(removeDeadRecipes, Plan);
 
@@ -2853,8 +2852,7 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
     R->dissolveToCFGLoop();
 }
 
-void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
-                                               Type &CanonicalIVTy) {
+void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
   VPTypeAnalysis TypeInfo(Plan);
   SmallVector<VPRecipeBase *> ToRemove;
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5de1483..35fa45ced 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -229,9 +229,8 @@ struct VPlanTransforms {
   ///    EVLIVInc, TripCount).
   static void canonicalizeEVLLoops(VPlan &Plan);
 
-  /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
-  /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
-  static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
+  /// Lower abstract recipes to concrete ones, that can be codegen'd.
+  static void convertToConcreteRecipes(VPlan &Plan);
 
   /// This function converts initial recipes to the abstract recipes and clamps
   /// \p Range based on cost model for following optimizations and cost
@@ -240,9 +239,8 @@ struct VPlanTransforms {
   static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                        VFRange &Range);
 
-  /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p
-  /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
-  static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy);
+  /// Perform instcombine-like simplifications on recipes in \p Plan.
+  static void simplifyRecipes(VPlan &Plan);
 
   /// Remove BranchOnCond recipes with true or false conditions together with
   /// removing dead edges to their successors.
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c45005d..4a681cb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1796,8 +1796,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
     ScalarizedCost +=
         TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
                             Align(1), LI->getPointerAddressSpace(), CostKind);
-    ScalarizedCost +=
-        TTI.getAddressComputationCost(LI->getPointerOperandType());
+    ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
+                                                    nullptr, nullptr, CostKind);
   }
 
   LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll b/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll
index 6a327cf..ef0b28e 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-shuffle-loadstore.ll
@@ -7,41 +7,41 @@
 define void @vld2(ptr %p) {
 ; CHECK-LABEL: 'vld2'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <4 x i8>, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <8 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = load <16 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8 = load <32 x i8>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <4 x i16>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = load <8 x i16>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = load <16 x i16>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16 = load <32 x i16>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = load <4 x i32>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = load <8 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = load <16 x i32>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i32 = load <32 x i32>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <4 x i64>, ptr %p, align 32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
@@ -252,88 +252,171 @@ define void @vld3(ptr %p) {
 }
 
 define void @vld4(ptr %p) {
-; CHECK-LABEL: 'vld4'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-UF2-LABEL: 'vld4'
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-UF4-LABEL: 'vld4'
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2560 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8 = load <8 x i8>, ptr %p
   %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
@@ -424,29 +507,29 @@ define void @vld4(ptr %p) {
 
 define void @vst2(ptr %p) {
 ; CHECK-LABEL: 'vst2'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i8> %v4i8, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i16> %v4i16, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> %v4i32, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i64> %v4i64, ptr %p, align 32
@@ -575,40 +658,75 @@ define void @vst3(ptr %p) {
 
 
 define void @vst4(ptr %p) {
-; CHECK-LABEL: 'vst4'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <64 x i8> %v64i8, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <64 x i16> %v64i16, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <64 x i32> %v64i32, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i64> %v8i64, ptr %p, align 64
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i64> %v16i64, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i64> %v32i64, ptr %p, align 256
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i64> %v64i64, ptr %p, align 512
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-UF2-LABEL: 'vst4'
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <64 x i8> %v64i8, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <64 x i16> %v64i16, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <64 x i32> %v64i32, ptr %p, align 256
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i64> %v8i64, ptr %p, align 64
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i64> %v16i64, ptr %p, align 128
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i64> %v32i64, ptr %p, align 256
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i64> %v64i64, ptr %p, align 512
+; CHECK-UF2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-UF4-LABEL: 'vst4'
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> %v8i8, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16i8, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <32 x i8> %v32i8, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <64 x i8> %v64i8, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v8i16, ptr %p, align 8
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i16> %v16i16, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i16> %v32i16, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <64 x i16> %v64i16, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <8 x i32> %v8i32, ptr %p, align 32
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <16 x i32> %v16i32, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <32 x i32> %v32i32, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <64 x i32> %v64i32, ptr %p, align 256
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i64> %v8i64, ptr %p, align 64
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 512 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i64> %v16i64, ptr %p, align 128
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 1024 for instruction: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i64> %v32i64, ptr %p, align 256
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 2048 for instruction: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i64> %v64i64, ptr %p, align 512
+; CHECK-UF4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
   store <8 x i8> %v8i8, ptr %p
@@ -648,6 +766,3 @@ define void @vst4(ptr %p) {
 
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-UF2: {{.*}}
-; CHECK-UF4: {{.*}}
diff --git a/llvm/test/Bindings/llvm-c/add_globaldebuginfo.ll b/llvm/test/Bindings/llvm-c/add_globaldebuginfo.ll
new file mode 100644
index 0000000..da6536a
--- /dev/null
+++ b/llvm/test/Bindings/llvm-c/add_globaldebuginfo.ll
@@ -0,0 +1,2 @@
+; RUN: llvm-c-test --add-globaldebuginfo < /dev/null
+; This used to trigger an assertion
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index d2f72ec..20f19fdd 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -617,6 +617,31 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){
   ret <8 x i64> %d
 }
 
+define <8 x i32> @scalar_i128(<2 x i128> %a) {
+; CHECK-SD-LABEL: scalar_i128:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d1, x2
+; CHECK-SD-NEXT:    fmov d0, x0
+; CHECK-SD-NEXT:    mov v1.d[1], x3
+; CHECK-SD-NEXT:    mov v0.d[1], x1
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    add v1.4s, v1.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_i128:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov v1.d[0], x2
+; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v1.d[1], x3
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v1.4s
+; CHECK-GI-NEXT:    ret
+  %c = bitcast <2 x i128> %a to <8 x i32>
+  %d = add <8 x i32> %c, %c
+  ret <8 x i32> %d
+}
+
 ; ===== Vectors with Non-Pow 2 Widths =====
 
 define <6 x i16> @bitcast_v3i32_v6i16(<3 x i32> %a, <3 x i32> %b){
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
index a707081..57b7a82 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
@@ -2,6 +2,103 @@
 # RUN: llc -mtriple=amdgcn -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
 
 ---
+name: gep_p0_s_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: gep_p0_s_k
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    %0:_(p0) = COPY $sgpr0_sgpr1
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p0_s_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+
+    ; CHECK-LABEL: name: gep_p0_s_s
+    ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+    %0:_(p0) = COPY $sgpr0_sgpr1
+    %1:_(s64) = COPY $sgpr2_sgpr3
+    %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p0_v_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: gep_p0_v_k
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+    %0:_(p0) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p0_v_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: gep_p0_v_s
+    ; CHECK: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p0) = G_PTR_ADD [[COPY]], [[COPY2]](s64)
+    %0:_(p0) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $sgpr0_sgpr1
+    %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p0_v_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: gep_p0_v_v
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+    %0:_(p0) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
 name: gep_p1_s_k
 legalized: true
 
@@ -97,3 +194,294 @@ body: |
     %1:_(s64) = COPY $vgpr2_vgpr3
     %2:_(p1) = G_PTR_ADD %0, %1
 ...
+
+---
+name: gep_p3_s_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: gep_p3_s_k
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    %0:_(p3) = COPY $sgpr0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p3_s_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: gep_p3_s_s
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p3) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+    %0:_(p3) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p3_v_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: gep_p3_v_k
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+    %0:_(p3) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p3_v_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+
+    ; CHECK-LABEL: name: gep_p3_v_s
+    ; CHECK: liveins: $vgpr0, $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[COPY2]](s32)
+    %0:_(p3) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr0
+    %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p3_v_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: gep_p3_v_v
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+    %0:_(p3) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_s_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: gep_p4_s_k
+    ; CHECK: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_s_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+
+    ; CHECK-LABEL: name: gep_p4_s_s
+    ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(s64) = COPY $sgpr2_sgpr3
+    %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_v_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: gep_p4_v_k
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+    %0:_(p4) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_v_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: gep_p4_v_s
+    ; CHECK: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[COPY2]](s64)
+    %0:_(p4) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $sgpr0_sgpr1
+    %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_v_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: gep_p4_v_v
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+    %0:_(p4) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_s_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: gep_p5_s_k
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    %0:_(p5) = COPY $sgpr0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(p5) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_s_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: gep_p5_s_s
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+    %0:_(p5) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(p5) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_v_k
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: gep_p5_v_k
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p5) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+    %0:_(p5) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(p5) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_v_s
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $sgpr0
+
+    ; CHECK-LABEL: name: gep_p5_v_s
+    ; CHECK: liveins: $vgpr0, $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p5) = G_PTR_ADD [[COPY]], [[COPY2]](s32)
+    %0:_(p5) = COPY $vgpr0
+    %1:_(s32) = COPY $sgpr0
+    %2:_(p5) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_v_v
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: gep_p5_v_v
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p5) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+    %0:_(p5) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p5) = G_PTR_ADD %0, %1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
index 9f7f228..535e02c 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
@@ -18,6 +18,12 @@ define amdgpu_kernel void @v_input_output_i8() {
   ret void
 }
 
+; GCN: error: couldn't allocate input reg for constraint 'v'
+define amdgpu_kernel void @v_input_empty_struct() {
+  call void asm "", "v"({} poison)
+  ret void
+}
+
 ; SICI: error: couldn't allocate output register for constraint 's'
 ; SICI: error: couldn't allocate input reg for constraint 's'
 ; VI-NOT: error
diff --git a/llvm/test/CodeGen/AMDGPU/unify-metadata.ll b/llvm/test/CodeGen/AMDGPU/unify-metadata.ll
deleted file mode 100644
index 455993b..0000000
--- a/llvm/test/CodeGen/AMDGPU/unify-metadata.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -passes=amdgpu-unify-metadata -S < %s | FileCheck -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn--amdhsa -passes=amdgpu-unify-metadata -S < %s | FileCheck -check-prefix=ALL %s
-
-; This test check that we have a singe metadata value after linking several
-; modules for records such as opencl.ocl.version, llvm.ident and similar.
-
-; ALL-DAG: !opencl.ocl.version = !{![[OCL_VER:[0-9]+]]}
-; ALL-DAG: !llvm.ident = !{![[LLVM_IDENT_0:[0-9]+]], ![[LLVM_IDENT_1:[0-9]+]]}
-; ALL-DAG: !opencl.used.extensions = !{![[USED_EXT_0:[0-9]+]], ![[USED_EXT_1:[0-9]+]], ![[USED_EXT_2:[0-9]+]]}
-
-; ALL-DAG: ![[OCL_VER]] = !{i32 1, i32 2}
-; ALL-DAG: ![[LLVM_IDENT_0]] = !{!"clang version 4.0"}
-; ALL-DAG: ![[LLVM_IDENT_1]] = !{!"clang version 4.0 (rLXXXXXX)"}
-; ALL-DAG: ![[USED_EXT_0]] = !{!"cl_images"}
-; ALL-DAG: ![[USED_EXT_1]] = !{!"cl_khr_fp16"}
-; ALL-DAG: ![[USED_EXT_2]] = !{!"cl_doubles"}
-
-!opencl.ocl.version = !{!1, !0, !0, !0}
-!llvm.ident = !{!2, !2, !2, !2, !6}
-!opencl.used.extensions = !{!3, !3, !4, !5}
-
-!0 = !{i32 2, i32 0}
-!1 = !{i32 1, i32 2}
-!2 = !{!"clang version 4.0"}
-!3 = !{!"cl_images", !"cl_khr_fp16"}
-!4 = !{!"cl_images", !"cl_doubles"}
-!5 = !{}
-!6 = !{!"clang version 4.0 (rLXXXXXX)"}
diff --git a/llvm/test/CodeGen/ARM/vtrn.ll b/llvm/test/CodeGen/ARM/vtrn.ll
index 136fec3..6377469 100644
--- a/llvm/test/CodeGen/ARM/vtrn.ll
+++ b/llvm/test/CodeGen/ARM/vtrn.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
 
 define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
@@ -20,11 +21,11 @@ define <8 x i8> @vtrni8(ptr %A, ptr %B) nounwind {
 define <16 x i8> @vtrni8_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrni8_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.8 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.8 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -52,11 +53,11 @@ define <4 x i16> @vtrni16(ptr %A, ptr %B) nounwind {
 define <8 x i16> @vtrni16_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrni16_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.16 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.16 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -84,11 +85,11 @@ define <2 x i32> @vtrni32(ptr %A, ptr %B) nounwind {
 define <4 x i32> @vtrni32_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrni32_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.32 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.32 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -116,11 +117,11 @@ define <2 x float> @vtrnf(ptr %A, ptr %B) nounwind {
 define <4 x float> @vtrnf_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrnf_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.32 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.32 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <2 x float>, ptr %A
   %tmp2 = load <2 x float>, ptr %B
@@ -281,11 +282,11 @@ define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
 define <16 x i8> @vtrni8_undef_Qres(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: vtrni8_undef_Qres:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vldr [[LDR1:d[0-9]+]], [r1]
-; CHECK-NEXT:    vldr [[LDR0:d[0-9]+]], [r0]
-; CHECK-NEXT:    vtrn.8 [[LDR0]], [[LDR1]]
-; CHECK-NEXT:    vmov r0, r1, [[LDR0]]
-; CHECK-NEXT:    vmov r2, r3, [[LDR1]]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.8 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r2, r3, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -327,9 +328,15 @@ define <16 x i16> @vtrnQi16_undef_QQres(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @vtrn_lower_shufflemask_undef(ptr %A, ptr %B) {
+; CHECK-LABEL: vtrn_lower_shufflemask_undef:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vtrn.16 d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d16
+; CHECK-NEXT:    mov pc, lr
 entry:
-  ; CHECK-LABEL: vtrn_lower_shufflemask_undef
-  ; CHECK: vtrn
 	%tmp1 = load <4 x i16>, ptr %A
 	%tmp2 = load <4 x i16>, ptr %B
   %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 5, i32 3, i32 7>
@@ -340,12 +347,26 @@ entry:
 ; values do modify the type. However, we get different input types, as some of
 ; them get truncated from i32 to i8 (from comparing cmp0 with cmp1) and some of
 ; them get truncated from i16 to i8 (from comparing cmp2 with cmp3).
-define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
-                                             <4 x i32> %cmp0, <4 x i32> %cmp1,
-                                             <4 x i16> %cmp2, <4 x i16> %cmp3) {
-  ; CHECK-LABEL: vtrn_mismatched_builvector0:
-  ; CHECK: vmovn.i32
-  ; CHECK: vbsl
+define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1, <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i16> %cmp2, <4 x i16> %cmp3) {
+; CHECK-LABEL: vtrn_mismatched_builvector0:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vldr d20, [sp, #32]
+; CHECK-NEXT:    vldr d18, [sp, #40]
+; CHECK-NEXT:    vcgt.u16 d18, d18, d20
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vtrn.8 d16, d18
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vshr.s8 d16, d16, #7
+; CHECK-NEXT:    vbsl d16, d18, d17
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %c0 = icmp ult <4 x i32> %cmp0, %cmp1
   %c1 = icmp ult <4 x i16> %cmp2, %cmp3
   %c = shufflevector <4 x i1> %c0, <4 x i1> %c1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -356,12 +377,30 @@ define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
 ; Here we get a build_vector node, where half the incoming extract_element
 ; values do not modify the type (the values form cmp2), but half of them do
 ; (from the icmp operation).
-define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
-                           <4 x i32> %cmp0, <4 x i32> %cmp1, ptr %cmp2_ptr) {
-  ; CHECK-LABEL: vtrn_mismatched_builvector1:
-  ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
-  ; CHECK: vmovl
-  ; CHECK: vbsl
+; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
+define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1, <4 x i32> %cmp0, <4 x i32> %cmp1, ptr %cmp2_ptr) {
+; CHECK-LABEL: vtrn_mismatched_builvector1:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    add r12, sp, #8
+; CHECK-NEXT:    add lr, sp, #24
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    ldr r12, [sp, #40]
+; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
+; CHECK-NEXT:    vmovl.u8 q9, d18
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vtrn.8 d16, d18
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vshr.s8 d16, d16, #7
+; CHECK-NEXT:    vbsl d16, d18, d17
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    pop {r11, lr}
+; CHECK-NEXT:    mov pc, lr
   %cmp2_load = load <4 x i8>, ptr %cmp2_ptr, align 4
   %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
   %c0 = icmp ult <4 x i32> %cmp0, %cmp1
@@ -373,15 +412,15 @@ define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
 ; The shuffle mask is half a vtrn; we duplicate the half to produce the
 ; full result.
 define void @lower_twice_no_vtrn(ptr %A, ptr %B, ptr %C) {
+; CHECK-LABEL: lower_twice_no_vtrn:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d18, [r0]
+; CHECK-NEXT:    vtrn.16 d18, d16
+; CHECK-NEXT:    vorr d17, d16, d16
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r2]
+; CHECK-NEXT:    mov pc, lr
 entry:
-  ; CHECK-LABEL: lower_twice_no_vtrn:
-  ; CHECK: @ %bb.0:
-  ; CHECK-NEXT: vldr d16, [r1]
-  ; CHECK-NEXT: vldr d18, [r0]
-  ; CHECK-NEXT: vtrn.16 d18, d16
-  ; CHECK-NEXT: vorr d17, d16, d16
-  ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
-  ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 5, i32 3, i32 7, i32 1, i32 5, i32 3, i32 7>
@@ -392,18 +431,49 @@ entry:
 ; The shuffle mask is half a vtrn; we duplicate the half to produce the
 ; full result.
 define void @upper_twice_no_vtrn(ptr %A, ptr %B, ptr %C) {
+; CHECK-LABEL: upper_twice_no_vtrn:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d18, [r0]
+; CHECK-NEXT:    vtrn.16 d18, d16
+; CHECK-NEXT:    vorr d19, d18, d18
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r2]
+; CHECK-NEXT:    mov pc, lr
 entry:
-  ; CHECK-LABEL: upper_twice_no_vtrn:
-  ; CHECK: @ %bb.0:
-  ; CHECK-NEXT: vldr d16, [r1]
-  ; CHECK-NEXT: vldr d18, [r0]
-  ; CHECK-NEXT: vtrn.16 d18, d16
-  ; CHECK-NEXT: vorr d19, d18, d18
-  ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
-  ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 6, i32 0, i32 4, i32 2, i32 6>
   store <8 x i16> %0, ptr %C
   ret void
 }
+
+define void @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+; CHECK-LABEL: test_15xi16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    add r2, r0, #2
+; CHECK-NEXT:    add r3, r0, #6
+; CHECK-NEXT:    vld1.16 {d16, d17}, [r2]!
+; CHECK-NEXT:    vld1.16 {d18}, [r2]!
+; CHECK-NEXT:    vld1.16 {d20, d21}, [r3]!
+; CHECK-NEXT:    ldr r2, [r2]
+; CHECK-NEXT:    vld1.16 {d22}, [r3]!
+; CHECK-NEXT:    vmov.16 d19[0], r2
+; CHECK-NEXT:    ldr r3, [r3]
+; CHECK-NEXT:    add r2, r0, #30
+; CHECK-NEXT:    add r0, r0, #34
+; CHECK-NEXT:    vmov.16 d19[1], r3
+; CHECK-NEXT:    vld1.16 {d19[2]}, [r2:16]
+; CHECK-NEXT:    vtrn.16 q8, q10
+; CHECK-NEXT:    vld1.16 {d19[3]}, [r0:16]
+; CHECK-NEXT:    vtrn.16 d18, d22
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r1]!
+; CHECK-NEXT:    vst1.16 {d18, d19}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+  %b = load <15 x i16>, ptr %a, align 2
+  %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+  %d = load <15 x i16>, ptr %c, align 2
+  %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32> <i32 0, i32 15, i32 2, i32 17, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>
+  store <16 x i16> %interleaved.vec, ptr %next.gep13, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/ARM/vuzp.ll b/llvm/test/CodeGen/ARM/vuzp.ll
index 7e1dfba..d24dadc 100644
--- a/llvm/test/CodeGen/ARM/vuzp.ll
+++ b/llvm/test/CodeGen/ARM/vuzp.ll
@@ -535,3 +535,59 @@ define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
   %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
   ret %struct.uint8x8x2_t %.fca.0.1.insert
 }
+
+define void @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+; CHECK-LABEL: test_15xi16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    add r2, r0, #2
+; CHECK-NEXT:    add r3, r0, #6
+; CHECK-NEXT:    vld1.16 {d20, d21}, [r2]!
+; CHECK-NEXT:    vld1.16 {d16}, [r2]!
+; CHECK-NEXT:    vmov.u16 r12, d16[0]
+; CHECK-NEXT:    ldr r2, [r2]
+; CHECK-NEXT:    vmov.u16 r4, d20[0]
+; CHECK-NEXT:    vld1.16 {d22, d23}, [r3]!
+; CHECK-NEXT:    vld1.16 {d24}, [r3]!
+; CHECK-NEXT:    vmov.u16 lr, d16[2]
+; CHECK-NEXT:    vmov.u16 r5, d22[0]
+; CHECK-NEXT:    vmov.u16 r6, d21[0]
+; CHECK-NEXT:    vmov.16 d17[0], r12
+; CHECK-NEXT:    vmov.16 d16[0], r4
+; CHECK-NEXT:    vmov.u16 r4, d24[0]
+; CHECK-NEXT:    vmov.u16 r12, d24[2]
+; CHECK-NEXT:    vmov.16 d17[1], lr
+; CHECK-NEXT:    vmov.16 d18[0], r5
+; CHECK-NEXT:    vmov.u16 r5, d20[2]
+; CHECK-NEXT:    vmov.u16 lr, d23[0]
+; CHECK-NEXT:    vmov.16 d19[0], r4
+; CHECK-NEXT:    vmov.u16 r4, d22[2]
+; CHECK-NEXT:    vmov.16 d16[1], r5
+; CHECK-NEXT:    vmov.u16 r5, d21[2]
+; CHECK-NEXT:    vmov.16 d17[2], r2
+; CHECK-NEXT:    ldr r2, [r3]
+; CHECK-NEXT:    vmov.16 d16[2], r6
+; CHECK-NEXT:    vmov.16 d18[1], r4
+; CHECK-NEXT:    vmov.u16 r4, d23[2]
+; CHECK-NEXT:    vmov.16 d19[1], r12
+; CHECK-NEXT:    vmov.16 d18[2], lr
+; CHECK-NEXT:    vmov.16 d19[2], r2
+; CHECK-NEXT:    add r2, r0, #30
+; CHECK-NEXT:    add r0, r0, #34
+; CHECK-NEXT:    vld1.16 {d17[3]}, [r2:16]
+; CHECK-NEXT:    vmov.16 d16[3], r5
+; CHECK-NEXT:    vmov.16 d18[3], r4
+; CHECK-NEXT:    vld1.16 {d19[3]}, [r0:16]
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r1]!
+; CHECK-NEXT:    vst1.16 {d18, d19}, [r1]
+; CHECK-NEXT:    pop {r4, r5, r6, lr}
+; CHECK-NEXT:    mov pc, lr
+  %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+  %b = load <15 x i16>, ptr %a, align 2
+  %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+  %d = load <15 x i16>, ptr %c, align 2
+  %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29>
+  store <16 x i16> %interleaved.vec, ptr %next.gep13, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/ARM/vzip.ll b/llvm/test/CodeGen/ARM/vzip.ll
index dda774a..ce40a2e 100644
--- a/llvm/test/CodeGen/ARM/vzip.ll
+++ b/llvm/test/CodeGen/ARM/vzip.ll
@@ -381,3 +381,22 @@ entry:
   %vzip.i = shufflevector <8 x i8> %lane, <8 x i8> %lane3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   ret <8 x i8> %vzip.i
 }
+
+define <16 x i16> @test_15xi16(ptr %next.gep, ptr %next.gep13) {
+; CHECK-LABEL: test_15xi16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    add r1, r1, #2
+; CHECK-NEXT:    mov r2, #4
+; CHECK-NEXT:    vld1.16 {d16, d17}, [r1], r2
+; CHECK-NEXT:    vld1.16 {d18, d19}, [r1]
+; CHECK-NEXT:    vzip.16 q8, q9
+; CHECK-NEXT:    vst1.16 {d16, d17}, [r0:128]!
+; CHECK-NEXT:    vst1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT:    mov pc, lr
+  %a = getelementptr inbounds nuw i8, ptr %next.gep, i32 2
+  %b = load <15 x i16>, ptr %a, align 2
+  %c = getelementptr inbounds nuw i8, ptr %next.gep, i32 6
+  %d = load <15 x i16>, ptr %c, align 2
+  %interleaved.vec = shufflevector <15 x i16> %b, <15 x i16> %d, <16 x i32> <i32 0, i32 15, i32 1, i32 16, i32 2, i32 17, i32 3, i32 18, i32 4, i32 19, i32 5, i32 20, i32 6, i32 21, i32 7, i32 22>
+  ret <16 x i16> %interleaved.vec
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
index ac5a214..c074bfe 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
@@ -30,8 +30,8 @@ entry:
 define <8 x i32> @insert_extract_v8i32(<8 x i32> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v8i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 7
-; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
+; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 7
+; CHECK-NEXT:    xvinsve0.w $xr0, $xr1, 1
 ; CHECK-NEXT:    ret
 entry:
   %b = extractelement <8 x i32> %a, i32 7
@@ -39,6 +39,18 @@ entry:
   ret <8 x i32> %c
 }
 
+
+define <8 x i32> @insert_extract0_v8i32(<8 x i32> %a) nounwind {
+; CHECK-LABEL: insert_extract0_v8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvinsve0.w $xr0, $xr0, 1
+; CHECK-NEXT:    ret
+entry:
+  %b = extractelement <8 x i32> %a, i32 0
+  %c = insertelement <8 x i32> %a, i32 %b, i32 1
+  ret <8 x i32> %c
+}
+
 define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v8f32:
 ; CHECK:       # %bb.0: # %entry
@@ -54,8 +66,8 @@ entry:
 define <4 x i64> @insert_extract_v4i64(<4 x i64> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v4i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 1
+; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 3
+; CHECK-NEXT:    xvinsve0.d $xr0, $xr1, 1
 ; CHECK-NEXT:    ret
 entry:
   %b = extractelement <4 x i64> %a, i32 3
@@ -63,6 +75,17 @@ entry:
   ret <4 x i64> %c
 }
 
+define <4 x i64> @insert_extract0_v4i64(<4 x i64> %a) nounwind {
+; CHECK-LABEL: insert_extract0_v4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvinsve0.d $xr0, $xr0, 1
+; CHECK-NEXT:    ret
+entry:
+  %b = extractelement <4 x i64> %a, i32 0
+  %c = insertelement <4 x i64> %a, i64 %b, i32 1
+  ret <4 x i64> %c
+}
+
 define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v4f64:
 ; CHECK:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
index 80627a0..e1d4ef1 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
@@ -22,7 +22,7 @@ define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
 ; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r5, %r4, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
-  %r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
+  %r = call afn <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
@@ -41,7 +41,7 @@ define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {
 ; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r5, %r4, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
-  %r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)
+  %r = call afn <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)
   ret <2 x bfloat> %r
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 2b7e418..d4aec4f 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -886,8 +886,8 @@ define half @test_sqrt(half %a) #0 {
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
-define half @test_sin(half %a) #0 #1 {
-  %r = call half @llvm.sin.f16(half %a)
+define half @test_sin(half %a) #0 {
+  %r = call afn half @llvm.sin.f16(half %a)
   ret half %r
 }
 
@@ -900,8 +900,8 @@ define half @test_sin(half %a) #0 #1 {
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[RF]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
-define half @test_cos(half %a) #0 #1 {
-  %r = call half @llvm.cos.f16(half %a)
+define half @test_cos(half %a) #0 {
+  %r = call afn half @llvm.cos.f16(half %a)
   ret half %r
 }
 
@@ -1183,4 +1183,3 @@ define <2 x half> @test_neg_f16x2(<2 x half> noundef %arg) #0 {
 }
 
 attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index d4fcea3..991311f 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1674,7 +1674,7 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 {
 ;  ret <2 x half> %r
 ;}
 
-define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
+define <2 x half> @test_sin(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_sin(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
@@ -1692,11 +1692,11 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
 ; CHECK-NEXT:    mov.b32 %r6, {%rs4, %rs3};
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
-  %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
+  %r = call afn <2 x half> @llvm.sin.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
-define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
+define <2 x half> @test_cos(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_cos(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
@@ -1714,7 +1714,7 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
 ; CHECK-NEXT:    mov.b32 %r6, {%rs4, %rs3};
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
-  %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
+  %r = call afn <2 x half> @llvm.cos.f16(<2 x half> %a)
   ret <2 x half> %r
 }
 
@@ -2330,4 +2330,3 @@ define void @test_store_2xhalf(ptr %p1, ptr %p2, <2 x half> %v) {
 
 
 attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 47b7c9a..4674597 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -1638,7 +1638,7 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 {
 ;  ret <2 x float> %r
 ;}
 
-define <2 x float> @test_sin(<2 x float> %a) #0 #1 {
+define <2 x float> @test_sin(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_sin(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
@@ -1651,11 +1651,11 @@ define <2 x float> @test_sin(<2 x float> %a) #0 #1 {
 ; CHECK-NEXT:    sin.approx.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
 ; CHECK-NEXT:    ret;
-  %r = call <2 x float> @llvm.sin(<2 x float> %a)
+  %r = call afn <2 x float> @llvm.sin(<2 x float> %a)
   ret <2 x float> %r
 }
 
-define <2 x float> @test_cos(<2 x float> %a) #0 #1 {
+define <2 x float> @test_cos(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_cos(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
@@ -1668,7 +1668,7 @@ define <2 x float> @test_cos(<2 x float> %a) #0 #1 {
 ; CHECK-NEXT:    cos.approx.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
 ; CHECK-NEXT:    ret;
-  %r = call <2 x float> @llvm.cos(<2 x float> %a)
+  %r = call afn <2 x float> @llvm.cos(<2 x float> %a)
   ret <2 x float> %r
 }
 
@@ -2157,5 +2157,4 @@ define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) {
 
 
 attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
 attributes #2 = { "denormal-fp-math"="preserve-sign" }
diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll
index 5eda3a1..8561c60 100644
--- a/llvm/test/CodeGen/NVPTX/fast-math.ll
+++ b/llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -22,7 +22,7 @@ define float @sqrt_div(float %a, float %b) {
   ret float %t2
 }
 
-define float @sqrt_div_fast(float %a, float %b) #0 {
+define float @sqrt_div_fast(float %a, float %b) {
 ; CHECK-LABEL: sqrt_div_fast(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
@@ -34,29 +34,25 @@ define float @sqrt_div_fast(float %a, float %b) #0 {
 ; CHECK-NEXT:    div.approx.f32 %r4, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
-  %t1 = tail call float @llvm.sqrt.f32(float %a)
-  %t2 = fdiv float %t1, %b
+  %t1 = tail call afn float @llvm.sqrt.f32(float %a)
+  %t2 = fdiv afn float %t1, %b
   ret float %t2
 }
 
-define float @sqrt_div_fast_ninf(float %a, float %b) #0 {
+define float @sqrt_div_fast_ninf(float %a, float %b) {
 ; CHECK-LABEL: sqrt_div_fast_ninf(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_ninf_param_0];
 ; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
-; CHECK-NEXT:    abs.f32 %r3, %r1;
-; CHECK-NEXT:    setp.lt.f32 %p1, %r3, 0f00800000;
-; CHECK-NEXT:    selp.f32 %r4, 0f00000000, %r2, %p1;
-; CHECK-NEXT:    ld.param.b32 %r5, [sqrt_div_fast_ninf_param_1];
-; CHECK-NEXT:    div.approx.f32 %r6, %r4, %r5;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_fast_ninf_param_1];
+; CHECK-NEXT:    div.approx.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn float @llvm.sqrt.f32(float %a)
-  %t2 = fdiv float %t1, %b
+  %t2 = fdiv afn float %t1, %b
   ret float %t2
 }
 
@@ -77,7 +73,7 @@ define float @sqrt_div_ftz(float %a, float %b) #1 {
   ret float %t2
 }
 
-define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 {
+define float @sqrt_div_fast_ftz(float %a, float %b) #1 {
 ; CHECK-LABEL: sqrt_div_fast_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
@@ -89,35 +85,32 @@ define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 {
 ; CHECK-NEXT:    div.approx.ftz.f32 %r4, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
-  %t1 = tail call float @llvm.sqrt.f32(float %a)
-  %t2 = fdiv float %t1, %b
+  %t1 = tail call afn float @llvm.sqrt.f32(float %a)
+  %t2 = fdiv afn float %t1, %b
   ret float %t2
 }
 
-define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #0 #1 {
+define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #1 {
 ; CHECK-LABEL: sqrt_div_fast_ftz_ninf(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_ftz_ninf_param_0];
-; CHECK-NEXT:    setp.eq.ftz.f32 %p1, %r1, 0f00000000;
 ; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
-; CHECK-NEXT:    selp.f32 %r3, 0f00000000, %r2, %p1;
-; CHECK-NEXT:    ld.param.b32 %r4, [sqrt_div_fast_ftz_ninf_param_1];
-; CHECK-NEXT:    div.approx.ftz.f32 %r5, %r3, %r4;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_fast_ftz_ninf_param_1];
+; CHECK-NEXT:    div.approx.ftz.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn float @llvm.sqrt.f32(float %a)
-  %t2 = fdiv float %t1, %b
+  %t2 = fdiv afn float %t1, %b
   ret float %t2
 }
 
 ; There are no fast-math or ftz versions of sqrt and div for f64.  We use
 ; reciprocal(rsqrt(x)) for sqrt(x), and emit a vanilla divide.
 
-define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 {
+define double @sqrt_div_fast_ftz_f64(double %a, double %b) #1 {
 ; CHECK-LABEL: sqrt_div_fast_ftz_f64(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
@@ -134,22 +127,17 @@ define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 {
   ret double %t2
 }
 
-define double @sqrt_div_fast_ftz_f64_ninf(double %a, double %b) #0 #1 {
+define double @sqrt_div_fast_ftz_f64_ninf(double %a, double %b) #1 {
 ; CHECK-LABEL: sqrt_div_fast_ftz_f64_ninf(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [sqrt_div_fast_ftz_f64_ninf_param_0];
-; CHECK-NEXT:    abs.f64 %rd2, %rd1;
-; CHECK-NEXT:    setp.lt.f64 %p1, %rd2, 0d0010000000000000;
-; CHECK-NEXT:    rsqrt.approx.f64 %rd3, %rd1;
-; CHECK-NEXT:    rcp.approx.ftz.f64 %rd4, %rd3;
-; CHECK-NEXT:    selp.f64 %rd5, 0d0000000000000000, %rd4, %p1;
-; CHECK-NEXT:    ld.param.b64 %rd6, [sqrt_div_fast_ftz_f64_ninf_param_1];
-; CHECK-NEXT:    div.rn.f64 %rd7, %rd5, %rd6;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [sqrt_div_fast_ftz_f64_ninf_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd4, %rd2, %rd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn double @llvm.sqrt.f64(double %a)
   %t2 = fdiv double %t1, %b
@@ -172,7 +160,7 @@ define float @rsqrt(float %a) {
   ret float %ret
 }
 
-define float @rsqrt_fast(float %a) #0 {
+define float @rsqrt_fast(float %a) {
 ; CHECK-LABEL: rsqrt_fast(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -182,12 +170,12 @@ define float @rsqrt_fast(float %a) #0 {
 ; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
-  %b = tail call float @llvm.sqrt.f32(float %a)
-  %ret = fdiv float 1.0, %b
+  %b = tail call afn float @llvm.sqrt.f32(float %a)
+  %ret = fdiv afn float 1.0, %b
   ret float %ret
 }
 
-define float @rsqrt_fast_ftz(float %a) #0 #1 {
+define float @rsqrt_fast_ftz(float %a) #1 {
 ; CHECK-LABEL: rsqrt_fast_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -197,8 +185,8 @@ define float @rsqrt_fast_ftz(float %a) #0 #1 {
 ; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
-  %b = tail call float @llvm.sqrt.f32(float %a)
-  %ret = fdiv float 1.0, %b
+  %b = tail call afn float @llvm.sqrt.f32(float %a)
+  %ret = fdiv afn float 1.0, %b
   ret float %ret
 }
 
@@ -263,35 +251,7 @@ define float @fcos_approx_afn(float %a) {
   ret float %r
 }
 
-define float @fsin_approx(float %a) #0 {
-; CHECK-LABEL: fsin_approx(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [fsin_approx_param_0];
-; CHECK-NEXT:    sin.approx.f32 %r2, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT:    ret;
-  %r = tail call float @llvm.sin.f32(float %a)
-  ret float %r
-}
-
-define float @fcos_approx(float %a) #0 {
-; CHECK-LABEL: fcos_approx(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [fcos_approx_param_0];
-; CHECK-NEXT:    cos.approx.f32 %r2, %r1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT:    ret;
-  %r = tail call float @llvm.cos.f32(float %a)
-  ret float %r
-}
-
-define float @fsin_approx_ftz(float %a) #0 #1 {
+define float @fsin_approx_ftz(float %a) #1 {
 ; CHECK-LABEL: fsin_approx_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -301,11 +261,11 @@ define float @fsin_approx_ftz(float %a) #0 #1 {
 ; CHECK-NEXT:    sin.approx.ftz.f32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
-  %r = tail call float @llvm.sin.f32(float %a)
+  %r = tail call afn float @llvm.sin.f32(float %a)
   ret float %r
 }
 
-define float @fcos_approx_ftz(float %a) #0 #1 {
+define float @fcos_approx_ftz(float %a) #1 {
 ; CHECK-LABEL: fcos_approx_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -315,7 +275,7 @@ define float @fcos_approx_ftz(float %a) #0 #1 {
 ; CHECK-NEXT:    cos.approx.ftz.f32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
-  %r = tail call float @llvm.cos.f32(float %a)
+  %r = tail call afn float @llvm.cos.f32(float %a)
   ret float %r
 }
 
@@ -423,7 +383,7 @@ define float @repeated_div_recip_allowed_ftz_sel(i1 %pred, float %a, float %b, f
   ret float %w
 }
 
-define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0 {
+define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) {
 ; CHECK-LABEL: repeated_div_fast(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -444,14 +404,14 @@ define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0
 ; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
 ; CHECK-NEXT:    ret;
-  %x = fdiv float %a, %divisor
-  %y = fdiv float %b, %divisor
-  %z = fmul float %x, %y
+  %x = fdiv afn arcp float %a, %divisor
+  %y = fdiv afn arcp contract float %b, %divisor
+  %z = fmul contract float %x, %y
   %w = select i1 %pred, float %z, float %y
   ret float %w
 }
 
-define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor) #0 {
+define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor) {
 ; CHECK-LABEL: repeated_div_fast_sel(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -469,13 +429,13 @@ define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor
 ; CHECK-NEXT:    div.approx.f32 %r5, %r3, %r4;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
-  %x = fdiv float %a, %divisor
-  %y = fdiv float %b, %divisor
+  %x = fdiv afn float %a, %divisor
+  %y = fdiv afn float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor) #0 #1 {
+define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor) #1 {
 ; CHECK-LABEL: repeated_div_fast_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -496,14 +456,14 @@ define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor
 ; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
 ; CHECK-NEXT:    ret;
-  %x = fdiv float %a, %divisor
-  %y = fdiv float %b, %divisor
-  %z = fmul float %x, %y
+  %x = fdiv afn arcp float %a, %divisor
+  %y = fdiv afn arcp contract float %b, %divisor
+  %z = fmul contract float %x, %y
   %w = select i1 %pred, float %z, float %y
   ret float %w
 }
 
-define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %divisor) #0 #1 {
+define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %divisor) #1 {
 ; CHECK-LABEL: repeated_div_fast_ftz_sel(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -521,13 +481,13 @@ define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %div
 ; CHECK-NEXT:    div.approx.ftz.f32 %r5, %r3, %r4;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
-  %x = fdiv float %a, %divisor
-  %y = fdiv float %b, %divisor
+  %x = fdiv afn float %a, %divisor
+  %y = fdiv afn float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-define float @frem(float %a, float %b) #0 {
+define float @frem(float %a, float %b) {
 ; CHECK-LABEL: frem(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
@@ -541,11 +501,11 @@ define float @frem(float %a, float %b) #0 {
 ; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
-  %rem = frem float %a, %b
+  %rem = frem afn arcp contract ninf float %a, %b
   ret float %rem
 }
 
-define float @frem_ftz(float %a, float %b) #0 #1 {
+define float @frem_ftz(float %a, float %b) #1 {
 ; CHECK-LABEL: frem_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
@@ -559,11 +519,11 @@ define float @frem_ftz(float %a, float %b) #0 #1 {
 ; CHECK-NEXT:    fma.rn.ftz.f32 %r6, %r5, %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
-  %rem = frem float %a, %b
+  %rem = frem afn contract ninf float %a, %b
   ret float %rem
 }
 
-define double @frem_f64(double %a, double %b) #0 {
+define double @frem_f64(double %a, double %b) {
 ; CHECK-LABEL: frem_f64(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<7>;
@@ -577,9 +537,8 @@ define double @frem_f64(double %a, double %b) #0 {
 ; CHECK-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; CHECK-NEXT:    ret;
-  %rem = frem double %a, %b
+  %rem = frem ninf double %a, %b
   ret double %rem
 }
 
-attributes #0 = { "unsafe-fp-math" = "true" }
 attributes #1 = { "denormal-fp-math-f32" = "preserve-sign" }
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
index 2f1d7d6..6d983ba 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
@@ -9,7 +9,7 @@
 ; SM < 80 or (which needs PTX version >= 70) should not emit fma{.ftz}.relu
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
 
-define half @fma_f16_no_nans(half %a, half %b, half %c) #0 {
+define half @fma_f16_no_nans(half %a, half %b, half %c) {
 ; CHECK-LABEL: fma_f16_no_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
@@ -49,14 +49,14 @@ define half @fma_f16_no_nans(half %a, half %b, half %c) #0 {
 ; CHECK-SM70-NEXT:    selp.b16 %rs6, %rs4, 0x0000, %p1;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs6;
 ; CHECK-SM70-NEXT:    ret;
-  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
+  %1 = call nnan half @llvm.fma.f16(half %a, half %b, half %c)
   %2 = fcmp ogt half %1, 0.0
-  %3 = select i1 %2, half %1, half 0.0
+  %3 = select nsz i1 %2, half %1, half 0.0
   ret half %3
 }
 
 ; FMA relu shouldn't be selected if the FMA operation has multiple uses
-define half @fma_f16_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) #0 {
+define half @fma_f16_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) {
 ; CHECK-LABEL: fma_f16_no_nans_multiple_uses_of_fma(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<8>;
@@ -103,13 +103,13 @@ define half @fma_f16_no_nans_multiple_uses_of_fma(half %a, half %b, half %c) #0
 ; CHECK-SM70-NEXT:    ret;
   %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
   %2 = fcmp ogt half %1, 0.0
-  %3 = select i1 %2, half %1, half 0.0
-  %4 = fadd half %1, 7.0
-  %5 = fadd half %4, %1
+  %3 = select  i1 %2, half %1, half 0.0
+  %4 = fadd contract half %1, 7.0
+  %5 = fadd contract half %4, %1
   ret half %5
 }
 
-define half @fma_f16_maxnum_no_nans(half %a, half %b, half %c) #0 {
+define half @fma_f16_maxnum_no_nans(half %a, half %b, half %c) {
 ; CHECK-LABEL: fma_f16_maxnum_no_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
@@ -149,12 +149,12 @@ define half @fma_f16_maxnum_no_nans(half %a, half %b, half %c) #0 {
 ; CHECK-SM70-NEXT:    cvt.rn.f16.f32 %rs5, %r2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs5;
 ; CHECK-SM70-NEXT:    ret;
-  %1 = call half @llvm.fma.f16(half %a, half %b, half %c)
-  %2 = call half @llvm.maxnum.f16(half %1, half 0.0)
+  %1 = call nnan half @llvm.fma.f16(half %a, half %b, half %c)
+  %2 = call nsz half @llvm.maxnum.f16(half %1, half 0.0)
   ret half %2
 }
 
-define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
+define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) {
 ; CHECK-LABEL: fma_bf16_no_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
@@ -205,14 +205,14 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-SM70-NEXT:    selp.b16 %rs2, %rs1, 0x0000, %p2;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; CHECK-SM70-NEXT:    ret;
-  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
   %2 = fcmp ogt bfloat %1, 0.0
-  %3 = select i1 %2, bfloat %1, bfloat 0.0
+  %3 = select nsz i1 %2, bfloat %1, bfloat 0.0
   ret bfloat %3
 }
 
 ; FMA_relu shouldn't be selected if the FMA operation has multiple uses
-define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
+define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloat %c) {
 ; CHECK-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<9>;
@@ -291,12 +291,12 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
   %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
   %2 = fcmp ogt bfloat %1, 0.0
   %3 = select i1 %2, bfloat %1, bfloat 0.0
-  %4 = fadd bfloat %1, 7.0
-  %5 = fadd bfloat %4, %1
+  %4 = fadd contract bfloat %1, 7.0
+  %5 = fadd contract bfloat %4, %1
   ret bfloat %5
 }
 
-define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
+define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) {
 ; CHECK-LABEL: fma_bf16_maxnum_no_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<5>;
@@ -351,12 +351,12 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-SM70-NEXT:    shr.u32 %r20, %r19, 16;
 ; CHECK-SM70-NEXT:    st.param.b16 [func_retval0], %r20;
 ; CHECK-SM70-NEXT:    ret;
-  %1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
-  %2 = call bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0)
+  %1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  %2 = call nsz bfloat @llvm.maxnum.bf16(bfloat %1, bfloat 0.0)
   ret bfloat %2
 }
 
-define <2 x half> @fma_f16x2_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+define <2 x half> @fma_f16x2_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
 ; CHECK-LABEL: fma_f16x2_no_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
@@ -399,14 +399,14 @@ define <2 x half> @fma_f16x2_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; CHECK-SM70-NEXT:    selp.b16 %rs4, %rs1, 0x0000, %p1;
 ; CHECK-SM70-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
 ; CHECK-SM70-NEXT:    ret;
-  %1 = call <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+  %1 = call nnan <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   %2 = fcmp ogt <2 x half> %1, <half 0.0, half 0.0>
-  %3 = select <2 x i1> %2, <2 x half> %1, <2 x half> <half 0.0, half 0.0>
+  %3 = select nsz <2 x i1> %2, <2 x half> %1, <2 x half> <half 0.0, half 0.0>
   ret <2 x half> %3
 }
 
 ; FMA relu shouldn't be selected if the FMA operation has multiple uses
-define <2 x half> @fma_f16x2_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+define <2 x half> @fma_f16x2_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
 ; CHECK-LABEL: fma_f16x2_no_nans_multiple_uses_of_fma(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
@@ -454,12 +454,12 @@ define <2 x half> @fma_f16x2_no_nans_multiple_uses_of_fma(<2 x half> %a, <2 x ha
   %1 = call <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   %2 = fcmp ogt <2 x half> %1, <half 0.0, half 0.0>
   %3 = select <2 x i1> %2, <2 x half> %1, <2 x half> <half 0.0, half 0.0>
-  %4 = fadd <2 x half> %1, <half 7.0, half 7.0>
-  %5 = fadd <2 x half> %4, %1
+  %4 = fadd contract <2 x half> %1, <half 7.0, half 7.0>
+  %5 = fadd contract <2 x half> %4, %1
   ret <2 x half> %5
 }
 
-define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
 ; CHECK-LABEL: fma_f16x2_maxnum_no_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
@@ -504,12 +504,12 @@ define <2 x half> @fma_f16x2_maxnum_no_nans(<2 x half> %a, <2 x half> %b, <2 x h
 ; CHECK-SM70-NEXT:    mov.b32 %r9, {%rs4, %rs3};
 ; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-SM70-NEXT:    ret;
-  %1 = call <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c)
-  %2 = call <2 x half> @llvm.maxnum.f16x2(<2 x half> %1, <2 x half> <half 0.0, half 0.0>)
+  %1 = call nnan <2 x half> @llvm.fma.f16x2(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+  %2 = call nsz <2 x half> @llvm.maxnum.f16x2(<2 x half> %1, <2 x half> <half 0.0, half 0.0>)
   ret <2 x half> %2
 }
 
-define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
+define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
 ; CHECK-LABEL: fma_bf16x2_no_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
@@ -580,14 +580,14 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
 ; CHECK-SM70-NEXT:    selp.b16 %rs10, %rs7, 0x0000, %p3;
 ; CHECK-SM70-NEXT:    st.param.v2.b16 [func_retval0], {%rs10, %rs9};
 ; CHECK-SM70-NEXT:    ret;
-  %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+  %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   %2 = fcmp ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
-  %3 = select <2 x i1> %2, <2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
+  %3 = select nsz <2 x i1> %2, <2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
   ret <2 x bfloat> %3
 }
 
 ; FMA_relu shouldn't be selected if the FMA operation has multiple uses
-define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
+define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
 ; CHECK-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<9>;
@@ -707,12 +707,12 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
   %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   %2 = fcmp ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
   %3 = select <2 x i1> %2, <2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
-  %4 = fadd <2 x bfloat> %1, <bfloat 7.0, bfloat 7.0>
-  %5 = fadd <2 x bfloat> %4, %1
+  %4 = fadd contract <2 x bfloat> %1, <bfloat 7.0, bfloat 7.0>
+  %5 = fadd contract <2 x bfloat> %4, %1
   ret <2 x bfloat> %5
 }
 
-define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #0 {
+define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
 ; CHECK-LABEL: fma_bf16x2_maxnum_no_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
@@ -792,10 +792,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-SM70-NEXT:    prmt.b32 %r39, %r38, %r31, 0x7632U;
 ; CHECK-SM70-NEXT:    st.param.b32 [func_retval0], %r39;
 ; CHECK-SM70-NEXT:    ret;
-  %1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
-  %2 = call <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+  %1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+  %2 = call nsz <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
   ret <2 x bfloat> %2
 }
-
-attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/NVPTX/frem.ll b/llvm/test/CodeGen/NVPTX/frem.ll
index 5805aed..d30c72c 100644
--- a/llvm/test/CodeGen/NVPTX/frem.ll
+++ b/llvm/test/CodeGen/NVPTX/frem.ll
@@ -1,313 +1,316 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s --enable-unsafe-fp-math -mcpu=sm_60 | FileCheck %s --check-prefixes=FAST
-; RUN: llc < %s -mcpu=sm_60 | FileCheck %s --check-prefixes=NORMAL
+; RUN: llc < %s -mcpu=sm_60 | FileCheck %s
 
 
 target triple = "nvptx64-unknown-cuda"
 
 define half @frem_f16(half %a, half %b) {
-; FAST-LABEL: frem_f16(
-; FAST:       {
-; FAST-NEXT:    .reg .b16 %rs<4>;
-; FAST-NEXT:    .reg .b32 %r<7>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b16 %rs1, [frem_f16_param_0];
-; FAST-NEXT:    ld.param.b16 %rs2, [frem_f16_param_1];
-; FAST-NEXT:    cvt.f32.f16 %r1, %rs2;
-; FAST-NEXT:    cvt.f32.f16 %r2, %rs1;
-; FAST-NEXT:    div.approx.f32 %r3, %r2, %r1;
-; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; FAST-NEXT:    neg.f32 %r5, %r4;
-; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
-; FAST-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
-; FAST-NEXT:    st.param.b16 [func_retval0], %rs3;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f16(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .pred %p<2>;
-; NORMAL-NEXT:    .reg .b16 %rs<4>;
-; NORMAL-NEXT:    .reg .b32 %r<8>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b16 %rs1, [frem_f16_param_0];
-; NORMAL-NEXT:    ld.param.b16 %rs2, [frem_f16_param_1];
-; NORMAL-NEXT:    cvt.f32.f16 %r1, %rs2;
-; NORMAL-NEXT:    cvt.f32.f16 %r2, %rs1;
-; NORMAL-NEXT:    div.rn.f32 %r3, %r2, %r1;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; NORMAL-NEXT:    neg.f32 %r5, %r4;
-; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
-; NORMAL-NEXT:    testp.infinite.f32 %p1, %r1;
-; NORMAL-NEXT:    selp.f32 %r7, %r2, %r6, %p1;
-; NORMAL-NEXT:    cvt.rn.f16.f32 %rs3, %r7;
-; NORMAL-NEXT:    st.param.b16 [func_retval0], %rs3;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [frem_f16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [frem_f16_param_1];
+; CHECK-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NEXT:    div.rn.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
+; CHECK-NEXT:    testp.infinite.f32 %p1, %r1;
+; CHECK-NEXT:    selp.f32 %r7, %r2, %r6, %p1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r7;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    ret;
   %r = frem half %a, %b
   ret half %r
 }
 
+define half @frem_f16_fast(half %a, half %b) {
+; CHECK-LABEL: frem_f16_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [frem_f16_fast_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [frem_f16_fast_param_1];
+; CHECK-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NEXT:    div.approx.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    ret;
+  %r = frem afn ninf half %a, %b
+  ret half %r
+}
+
 define float @frem_f32(float %a, float %b) {
-; FAST-LABEL: frem_f32(
-; FAST:       {
-; FAST-NEXT:    .reg .b32 %r<7>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_param_0];
-; FAST-NEXT:    ld.param.b32 %r2, [frem_f32_param_1];
-; FAST-NEXT:    div.approx.f32 %r3, %r1, %r2;
-; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; FAST-NEXT:    neg.f32 %r5, %r4;
-; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
-; FAST-NEXT:    st.param.b32 [func_retval0], %r6;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f32(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .pred %p<2>;
-; NORMAL-NEXT:    .reg .b32 %r<8>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_param_0];
-; NORMAL-NEXT:    ld.param.b32 %r2, [frem_f32_param_1];
-; NORMAL-NEXT:    div.rn.f32 %r3, %r1, %r2;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; NORMAL-NEXT:    neg.f32 %r5, %r4;
-; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
-; NORMAL-NEXT:    testp.infinite.f32 %p1, %r2;
-; NORMAL-NEXT:    selp.f32 %r7, %r1, %r6, %p1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %r7;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [frem_f32_param_1];
+; CHECK-NEXT:    div.rn.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; CHECK-NEXT:    testp.infinite.f32 %p1, %r2;
+; CHECK-NEXT:    selp.f32 %r7, %r1, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ret;
   %r = frem float %a, %b
   ret float %r
 }
 
+define float @frem_f32_fast(float %a, float %b) {
+; CHECK-LABEL: frem_f32_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_f32_fast_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [frem_f32_fast_param_1];
+; CHECK-NEXT:    div.approx.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
+  %r = frem afn ninf float %a, %b
+  ret float %r
+}
+
 define double @frem_f64(double %a, double %b) {
-; FAST-LABEL: frem_f64(
-; FAST:       {
-; FAST-NEXT:    .reg .b64 %rd<7>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b64 %rd1, [frem_f64_param_0];
-; FAST-NEXT:    ld.param.b64 %rd2, [frem_f64_param_1];
-; FAST-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
-; FAST-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
-; FAST-NEXT:    neg.f64 %rd5, %rd4;
-; FAST-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
-; FAST-NEXT:    st.param.b64 [func_retval0], %rd6;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f64(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .pred %p<2>;
-; NORMAL-NEXT:    .reg .b64 %rd<8>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b64 %rd1, [frem_f64_param_0];
-; NORMAL-NEXT:    ld.param.b64 %rd2, [frem_f64_param_1];
-; NORMAL-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
-; NORMAL-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
-; NORMAL-NEXT:    neg.f64 %rd5, %rd4;
-; NORMAL-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
-; NORMAL-NEXT:    testp.infinite.f64 %p1, %rd2;
-; NORMAL-NEXT:    selp.f64 %rd7, %rd1, %rd6, %p1;
-; NORMAL-NEXT:    st.param.b64 [func_retval0], %rd7;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [frem_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [frem_f64_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; CHECK-NEXT:    neg.f64 %rd5, %rd4;
+; CHECK-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; CHECK-NEXT:    testp.infinite.f64 %p1, %rd2;
+; CHECK-NEXT:    selp.f64 %rd7, %rd1, %rd6, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
+; CHECK-NEXT:    ret;
   %r = frem double %a, %b
   ret double %r
 }
 
+define double @frem_f64_fast(double %a, double %b) {
+; CHECK-LABEL: frem_f64_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [frem_f64_fast_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [frem_f64_fast_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; CHECK-NEXT:    neg.f64 %rd5, %rd4;
+; CHECK-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
+  %r = frem afn ninf double %a, %b
+  ret double %r
+}
+
 define half @frem_f16_ninf(half %a, half %b) {
-; FAST-LABEL: frem_f16_ninf(
-; FAST:       {
-; FAST-NEXT:    .reg .b16 %rs<4>;
-; FAST-NEXT:    .reg .b32 %r<7>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b16 %rs1, [frem_f16_ninf_param_0];
-; FAST-NEXT:    ld.param.b16 %rs2, [frem_f16_ninf_param_1];
-; FAST-NEXT:    cvt.f32.f16 %r1, %rs2;
-; FAST-NEXT:    cvt.f32.f16 %r2, %rs1;
-; FAST-NEXT:    div.approx.f32 %r3, %r2, %r1;
-; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; FAST-NEXT:    neg.f32 %r5, %r4;
-; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
-; FAST-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
-; FAST-NEXT:    st.param.b16 [func_retval0], %rs3;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f16_ninf(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .b16 %rs<4>;
-; NORMAL-NEXT:    .reg .b32 %r<7>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b16 %rs1, [frem_f16_ninf_param_0];
-; NORMAL-NEXT:    ld.param.b16 %rs2, [frem_f16_ninf_param_1];
-; NORMAL-NEXT:    cvt.f32.f16 %r1, %rs2;
-; NORMAL-NEXT:    cvt.f32.f16 %r2, %rs1;
-; NORMAL-NEXT:    div.rn.f32 %r3, %r2, %r1;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; NORMAL-NEXT:    neg.f32 %r5, %r4;
-; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
-; NORMAL-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
-; NORMAL-NEXT:    st.param.b16 [func_retval0], %rs3;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f16_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [frem_f16_ninf_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [frem_f16_ninf_param_1];
+; CHECK-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NEXT:    div.rn.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    ret;
   %r = frem ninf half %a, %b
   ret half %r
 }
 
+define half @frem_f16_ninf_fast(half %a, half %b) {
+; CHECK-LABEL: frem_f16_ninf_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b16 %rs1, [frem_f16_ninf_fast_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [frem_f16_ninf_fast_param_1];
+; CHECK-NEXT:    cvt.f32.f16 %r1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %r2, %rs1;
+; CHECK-NEXT:    div.approx.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r1, %r2;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r6;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    ret;
+  %r = frem afn ninf half %a, %b
+  ret half %r
+}
+
 define float @frem_f32_ninf(float %a, float %b) {
-; FAST-LABEL: frem_f32_ninf(
-; FAST:       {
-; FAST-NEXT:    .reg .b32 %r<7>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_ninf_param_0];
-; FAST-NEXT:    ld.param.b32 %r2, [frem_f32_ninf_param_1];
-; FAST-NEXT:    div.approx.f32 %r3, %r1, %r2;
-; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; FAST-NEXT:    neg.f32 %r5, %r4;
-; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
-; FAST-NEXT:    st.param.b32 [func_retval0], %r6;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f32_ninf(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .b32 %r<7>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_ninf_param_0];
-; NORMAL-NEXT:    ld.param.b32 %r2, [frem_f32_ninf_param_1];
-; NORMAL-NEXT:    div.rn.f32 %r3, %r1, %r2;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; NORMAL-NEXT:    neg.f32 %r5, %r4;
-; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %r6;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f32_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_f32_ninf_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [frem_f32_ninf_param_1];
+; CHECK-NEXT:    div.rn.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %r = frem ninf float %a, %b
   ret float %r
 }
 
+define float @frem_f32_ninf_fast(float %a, float %b) {
+; CHECK-LABEL: frem_f32_ninf_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_f32_ninf_fast_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [frem_f32_ninf_fast_param_1];
+; CHECK-NEXT:    div.approx.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
+  %r = frem  afn ninf float %a, %b
+  ret float %r
+}
+
 define double @frem_f64_ninf(double %a, double %b) {
-; FAST-LABEL: frem_f64_ninf(
-; FAST:       {
-; FAST-NEXT:    .reg .b64 %rd<7>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b64 %rd1, [frem_f64_ninf_param_0];
-; FAST-NEXT:    ld.param.b64 %rd2, [frem_f64_ninf_param_1];
-; FAST-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
-; FAST-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
-; FAST-NEXT:    neg.f64 %rd5, %rd4;
-; FAST-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
-; FAST-NEXT:    st.param.b64 [func_retval0], %rd6;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f64_ninf(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .b64 %rd<7>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b64 %rd1, [frem_f64_ninf_param_0];
-; NORMAL-NEXT:    ld.param.b64 %rd2, [frem_f64_ninf_param_1];
-; NORMAL-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
-; NORMAL-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
-; NORMAL-NEXT:    neg.f64 %rd5, %rd4;
-; NORMAL-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
-; NORMAL-NEXT:    st.param.b64 [func_retval0], %rd6;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f64_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [frem_f64_ninf_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [frem_f64_ninf_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; CHECK-NEXT:    neg.f64 %rd5, %rd4;
+; CHECK-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
   %r = frem ninf double %a, %b
   ret double %r
 }
 
+define double @frem_f64_ninf_fast(double %a, double %b) {
+; CHECK-LABEL: frem_f64_ninf_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [frem_f64_ninf_fast_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [frem_f64_ninf_fast_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; CHECK-NEXT:    neg.f64 %rd5, %rd4;
+; CHECK-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
+  %r = frem afn ninf double %a, %b
+  ret double %r
+}
+
 define float @frem_f32_imm1_fast(float %a) {
-; FAST-LABEL: frem_f32_imm1_fast(
-; FAST:       {
-; FAST-NEXT:    .reg .b32 %r<5>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_fast_param_0];
-; FAST-NEXT:    mul.f32 %r2, %r1, 0f3E124925;
-; FAST-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
-; FAST-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
-; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f32_imm1_fast(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .b32 %r<5>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_fast_param_0];
-; NORMAL-NEXT:    mul.rn.f32 %r2, %r1, 0f3E124925;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
-; NORMAL-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %r4;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f32_imm1_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_fast_param_0];
+; CHECK-NEXT:    mul.rn.f32 %r2, %r1, 0f3E124925;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
+; CHECK-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %r = frem arcp float %a, 7.0
   ret float %r
 }
 define float @frem_f32_imm1_normal(float %a) {
-; FAST-LABEL: frem_f32_imm1_normal(
-; FAST:       {
-; FAST-NEXT:    .reg .b32 %r<5>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_normal_param_0];
-; FAST-NEXT:    div.approx.f32 %r2, %r1, 0f40E00000;
-; FAST-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
-; FAST-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
-; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f32_imm1_normal(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .b32 %r<5>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_normal_param_0];
-; NORMAL-NEXT:    div.rn.f32 %r2, %r1, 0f40E00000;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
-; NORMAL-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %r4;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f32_imm1_normal(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_normal_param_0];
+; CHECK-NEXT:    div.rn.f32 %r2, %r1, 0f40E00000;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
+; CHECK-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %r = frem float %a, 7.0
   ret float %r
 }
 
 define float @frem_f32_imm2(float %a) {
-; FAST-LABEL: frem_f32_imm2(
-; FAST:       {
-; FAST-NEXT:    .reg .b32 %r<7>;
-; FAST-EMPTY:
-; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_imm2_param_0];
-; FAST-NEXT:    mov.b32 %r2, 0f40E00000;
-; FAST-NEXT:    div.approx.f32 %r3, %r2, %r1;
-; FAST-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; FAST-NEXT:    neg.f32 %r5, %r4;
-; FAST-NEXT:    fma.rn.f32 %r6, %r5, %r1, 0f40E00000;
-; FAST-NEXT:    st.param.b32 [func_retval0], %r6;
-; FAST-NEXT:    ret;
-;
-; NORMAL-LABEL: frem_f32_imm2(
-; NORMAL:       {
-; NORMAL-NEXT:    .reg .pred %p<2>;
-; NORMAL-NEXT:    .reg .b32 %r<8>;
-; NORMAL-EMPTY:
-; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_imm2_param_0];
-; NORMAL-NEXT:    mov.b32 %r2, 0f40E00000;
-; NORMAL-NEXT:    div.rn.f32 %r3, %r2, %r1;
-; NORMAL-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
-; NORMAL-NEXT:    neg.f32 %r5, %r4;
-; NORMAL-NEXT:    fma.rn.f32 %r6, %r5, %r1, 0f40E00000;
-; NORMAL-NEXT:    testp.infinite.f32 %p1, %r1;
-; NORMAL-NEXT:    selp.f32 %r7, 0f40E00000, %r6, %p1;
-; NORMAL-NEXT:    st.param.b32 [func_retval0], %r7;
-; NORMAL-NEXT:    ret;
+; CHECK-LABEL: frem_f32_imm2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_f32_imm2_param_0];
+; CHECK-NEXT:    mov.b32 %r2, 0f40E00000;
+; CHECK-NEXT:    div.rn.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r1, 0f40E00000;
+; CHECK-NEXT:    testp.infinite.f32 %p1, %r1;
+; CHECK-NEXT:    selp.f32 %r7, 0f40E00000, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ret;
   %r = frem float 7.0, %a
   ret float %r
 }
+
+define float @frem_f32_imm2_fast(float %a) {
+; CHECK-LABEL: frem_f32_imm2_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_f32_imm2_fast_param_0];
+; CHECK-NEXT:    mov.b32 %r2, 0f40E00000;
+; CHECK-NEXT:    div.approx.f32 %r3, %r2, %r1;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r1, 0f40E00000;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
+  %r = frem afn ninf float 7.0, %a
+  ret float %r
+}
diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
index 3989c8e3..7e4e701 100644
--- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -13,7 +13,7 @@ declare double @llvm.sqrt.f64(double)
 
 ; -- reciprocal sqrt --
 
-define float @test_rsqrt32(float %a) #0 {
+define float @test_rsqrt32(float %a) {
 ; CHECK-LABEL: test_rsqrt32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -28,7 +28,7 @@ define float @test_rsqrt32(float %a) #0 {
   ret float %ret
 }
 
-define float @test_rsqrt_ftz(float %a) #0 #1 {
+define float @test_rsqrt_ftz(float %a) #1 {
 ; CHECK-LABEL: test_rsqrt_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -76,7 +76,7 @@ define double @test_rsqrt64_ftz(double %a) #1 {
 
 ; -- sqrt --
 
-define float @test_sqrt32(float %a) #0 {
+define float @test_sqrt32(float %a) {
 ; CHECK-LABEL: test_sqrt32(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -90,7 +90,7 @@ define float @test_sqrt32(float %a) #0 {
   ret float %ret
 }
 
-define float @test_sqrt32_ninf(float %a) #0 {
+define float @test_sqrt32_ninf(float %a) {
 ; CHECK-LABEL: test_sqrt32_ninf(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -108,7 +108,7 @@ define float @test_sqrt32_ninf(float %a) #0 {
   ret float %ret
 }
 
-define float @test_sqrt_ftz(float %a) #0 #1 {
+define float @test_sqrt_ftz(float %a) #1 {
 ; CHECK-LABEL: test_sqrt_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -122,7 +122,7 @@ define float @test_sqrt_ftz(float %a) #0 #1 {
   ret float %ret
 }
 
-define float @test_sqrt_ftz_ninf(float %a) #0 #1 {
+define float @test_sqrt_ftz_ninf(float %a) #1 {
 ; CHECK-LABEL: test_sqrt_ftz_ninf(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -139,7 +139,7 @@ define float @test_sqrt_ftz_ninf(float %a) #0 #1 {
   ret float %ret
 }
 
-define double @test_sqrt64(double %a) #0 {
+define double @test_sqrt64(double %a) {
 ; CHECK-LABEL: test_sqrt64(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
@@ -156,7 +156,7 @@ define double @test_sqrt64(double %a) #0 {
 ; There's no sqrt.approx.f64 instruction; we emit
 ; reciprocal(rsqrt.approx.f64(x)).  There's no non-ftz approximate reciprocal,
 ; so we just use the ftz version.
-define double @test_sqrt64_ninf(double %a) #0 {
+define double @test_sqrt64_ninf(double %a) {
 ; CHECK-LABEL: test_sqrt64_ninf(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -175,7 +175,7 @@ define double @test_sqrt64_ninf(double %a) #0 {
   ret double %ret
 }
 
-define double @test_sqrt64_ftz(double %a) #0 #1 {
+define double @test_sqrt64_ftz(double %a) #1 {
 ; CHECK-LABEL: test_sqrt64_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
@@ -190,7 +190,7 @@ define double @test_sqrt64_ftz(double %a) #0 #1 {
 }
 
 ; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
-define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
+define double @test_sqrt64_ftz_ninf(double %a) #1 {
 ; CHECK-LABEL: test_sqrt64_ftz_ninf(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -214,7 +214,7 @@ define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
 ; The sqrt and rsqrt refinement algorithms both emit an rsqrt.approx, followed
 ; by some math.
 
-define float @test_rsqrt32_refined(float %a) #0 #2 {
+define float @test_rsqrt32_refined(float %a) #2 {
 ; CHECK-LABEL: test_rsqrt32_refined(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
@@ -229,11 +229,11 @@ define float @test_rsqrt32_refined(float %a) #0 #2 {
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
-  %ret = fdiv arcp float 1.0, %val
+  %ret = fdiv arcp contract float 1.0, %val
   ret float %ret
 }
 
-define float @test_sqrt32_refined(float %a) #0 #2 {
+define float @test_sqrt32_refined(float %a) #2 {
 ; CHECK-LABEL: test_sqrt32_refined(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -247,7 +247,7 @@ define float @test_sqrt32_refined(float %a) #0 #2 {
   ret float %ret
 }
 
-define float @test_sqrt32_refined_ninf(float %a) #0 #2 {
+define float @test_sqrt32_refined_ninf(float %a) #2 {
 ; CHECK-LABEL: test_sqrt32_refined_ninf(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -265,11 +265,11 @@ define float @test_sqrt32_refined_ninf(float %a) #0 #2 {
 ; CHECK-NEXT:    selp.f32 %r8, 0f00000000, %r6, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
 ; CHECK-NEXT:    ret;
-  %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
+  %ret = tail call ninf afn contract float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-define double @test_rsqrt64_refined(double %a) #0 #2 {
+define double @test_rsqrt64_refined(double %a) #2 {
 ; CHECK-LABEL: test_rsqrt64_refined(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<7>;
@@ -284,11 +284,11 @@ define double @test_rsqrt64_refined(double %a) #0 #2 {
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
-  %ret = fdiv arcp double 1.0, %val
+  %ret = fdiv arcp contract double 1.0, %val
   ret double %ret
 }
 
-define double @test_sqrt64_refined(double %a) #0 #2 {
+define double @test_sqrt64_refined(double %a) #2 {
 ; CHECK-LABEL: test_sqrt64_refined(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
@@ -302,7 +302,7 @@ define double @test_sqrt64_refined(double %a) #0 #2 {
   ret double %ret
 }
 
-define double @test_sqrt64_refined_ninf(double %a) #0 #2 {
+define double @test_sqrt64_refined_ninf(double %a) #2 {
 ; CHECK-LABEL: test_sqrt64_refined_ninf(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -320,13 +320,13 @@ define double @test_sqrt64_refined_ninf(double %a) #0 #2 {
 ; CHECK-NEXT:    selp.f64 %rd8, 0d0000000000000000, %rd6, %p1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd8;
 ; CHECK-NEXT:    ret;
-  %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
+  %ret = tail call ninf afn contract double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
 ; -- refined sqrt and rsqrt with ftz enabled --
 
-define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 {
+define float @test_rsqrt32_refined_ftz(float %a) #1 #2 {
 ; CHECK-LABEL: test_rsqrt32_refined_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
@@ -341,11 +341,11 @@ define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 {
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
-  %ret = fdiv arcp float 1.0, %val
+  %ret = fdiv arcp contract float 1.0, %val
   ret float %ret
 }
 
-define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 {
+define float @test_sqrt32_refined_ftz(float %a) #1 #2 {
 ; CHECK-LABEL: test_sqrt32_refined_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -359,7 +359,7 @@ define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 {
   ret float %ret
 }
 
-define float @test_sqrt32_refined_ftz_ninf(float %a) #0 #1 #2 {
+define float @test_sqrt32_refined_ftz_ninf(float %a) #1 #2 {
 ; CHECK-LABEL: test_sqrt32_refined_ftz_ninf(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -376,12 +376,12 @@ define float @test_sqrt32_refined_ftz_ninf(float %a) #0 #1 #2 {
 ; CHECK-NEXT:    selp.f32 %r7, 0f00000000, %r6, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
 ; CHECK-NEXT:    ret;
-  %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
+  %ret = tail call ninf afn contract float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
 ; There's no rsqrt.approx.ftz.f64, so we just use the non-ftz version.
-define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
+define double @test_rsqrt64_refined_ftz(double %a) #1 #2 {
 ; CHECK-LABEL: test_rsqrt64_refined_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<7>;
@@ -396,11 +396,11 @@ define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
-  %ret = fdiv arcp double 1.0, %val
+  %ret = fdiv arcp contract double 1.0, %val
   ret double %ret
 }
 
-define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 {
+define double @test_sqrt64_refined_ftz(double %a) #1 #2 {
 ; CHECK-LABEL: test_sqrt64_refined_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
@@ -414,7 +414,7 @@ define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 {
   ret double %ret
 }
 
-define double @test_sqrt64_refined_ftz_ninf(double %a) #0 #1 #2 {
+define double @test_sqrt64_refined_ftz_ninf(double %a) #1 #2 {
 ; CHECK-LABEL: test_sqrt64_refined_ftz_ninf(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
@@ -432,10 +432,9 @@ define double @test_sqrt64_refined_ftz_ninf(double %a) #0 #1 #2 {
 ; CHECK-NEXT:    selp.f64 %rd8, 0d0000000000000000, %rd6, %p1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd8;
 ; CHECK-NEXT:    ret;
-  %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
+  %ret = tail call ninf afn contract double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-attributes #0 = { "unsafe-fp-math" = "true" }
 attributes #1 = { "denormal-fp-math-f32" = "preserve-sign,preserve-sign" }
 attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" }
diff --git a/llvm/test/CodeGen/PowerPC/memintr32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll
index 4f0a996..a2af6d4 100644
--- a/llvm/test/CodeGen/PowerPC/memintr32.ll
+++ b/llvm/test/CodeGen/PowerPC/milicode32.ll
@@ -35,5 +35,37 @@ entry:
 
 declare i32 @memcmp(ptr noundef captures(none), ptr noundef captures(none), i32 noundef) nounwind
 
+define i32 @strlen_test(ptr noundef %str) nounwind {
+; CHECK-AIX-32-P9-LABEL: strlen_test:
+; CHECK-AIX-32-P9:       # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT:    mflr r0
+; CHECK-AIX-32-P9-NEXT:    stwu r1, -64(r1)
+; CHECK-AIX-32-P9-NEXT:    stw r0, 72(r1)
+; CHECK-AIX-32-P9-NEXT:    stw r3, 60(r1)
+; CHECK-AIX-32-P9-NEXT:    bl .strlen[PR]
+; CHECK-AIX-32-P9-NEXT:    nop
+; CHECK-AIX-32-P9-NEXT:    addi r1, r1, 64
+; CHECK-AIX-32-P9-NEXT:    lwz r0, 8(r1)
+; CHECK-AIX-32-P9-NEXT:    mtlr r0
+; CHECK-AIX-32-P9-NEXT:    blr
+;
+; CHECK-LINUX32-P9-LABEL: strlen_test:
+; CHECK-LINUX32-P9:       # %bb.0: # %entry
+; CHECK-LINUX32-P9-NEXT:    mflr r0
+; CHECK-LINUX32-P9-NEXT:    stwu r1, -16(r1)
+; CHECK-LINUX32-P9-NEXT:    stw r0, 20(r1)
+; CHECK-LINUX32-P9-NEXT:    stw r3, 12(r1)
+; CHECK-LINUX32-P9-NEXT:    bl strlen
+; CHECK-LINUX32-P9-NEXT:    lwz r0, 20(r1)
+; CHECK-LINUX32-P9-NEXT:    addi r1, r1, 16
+; CHECK-LINUX32-P9-NEXT:    mtlr r0
+; CHECK-LINUX32-P9-NEXT:    blr
+entry:
+  %str.addr = alloca ptr, align 4
+  store ptr %str, ptr %str.addr, align 4
+  %0 = load ptr, ptr %str.addr, align 4
+  %call = call i32 @strlen(ptr noundef %0)
+  ret i32 %call
+}
 
-
+declare i32 @strlen(ptr noundef) nounwind
diff --git a/llvm/test/CodeGen/PowerPC/memintr64.ll b/llvm/test/CodeGen/PowerPC/milicode64.ll
index 0b0e556..0f0585d9 100644
--- a/llvm/test/CodeGen/PowerPC/memintr64.ll
+++ b/llvm/test/CodeGen/PowerPC/milicode64.ll
@@ -52,4 +52,51 @@ entry:
 
 declare i32 @memcmp(ptr noundef captures(none), ptr noundef captures(none), i64 noundef) nounwind
 
+define i64 @strlen_test(ptr noundef %str) nounwind {
+; CHECK-LE-P9-LABEL: strlen_test:
+; CHECK-LE-P9:       # %bb.0: # %entry
+; CHECK-LE-P9-NEXT:    mflr r0
+; CHECK-LE-P9-NEXT:    stdu r1, -48(r1)
+; CHECK-LE-P9-NEXT:    std r0, 64(r1)
+; CHECK-LE-P9-NEXT:    std r3, 40(r1)
+; CHECK-LE-P9-NEXT:    bl strlen
+; CHECK-LE-P9-NEXT:    nop
+; CHECK-LE-P9-NEXT:    addi r1, r1, 48
+; CHECK-LE-P9-NEXT:    ld r0, 16(r1)
+; CHECK-LE-P9-NEXT:    mtlr r0
+; CHECK-LE-P9-NEXT:    blr
+;
+; CHECK-BE-P9-LABEL: strlen_test:
+; CHECK-BE-P9:       # %bb.0: # %entry
+; CHECK-BE-P9-NEXT:    mflr r0
+; CHECK-BE-P9-NEXT:    stdu r1, -128(r1)
+; CHECK-BE-P9-NEXT:    std r0, 144(r1)
+; CHECK-BE-P9-NEXT:    std r3, 120(r1)
+; CHECK-BE-P9-NEXT:    bl strlen
+; CHECK-BE-P9-NEXT:    nop
+; CHECK-BE-P9-NEXT:    addi r1, r1, 128
+; CHECK-BE-P9-NEXT:    ld r0, 16(r1)
+; CHECK-BE-P9-NEXT:    mtlr r0
+; CHECK-BE-P9-NEXT:    blr
+;
+; CHECK-AIX-64-P9-LABEL: strlen_test:
+; CHECK-AIX-64-P9:       # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT:    mflr r0
+; CHECK-AIX-64-P9-NEXT:    stdu r1, -128(r1)
+; CHECK-AIX-64-P9-NEXT:    std r0, 144(r1)
+; CHECK-AIX-64-P9-NEXT:    std r3, 120(r1)
+; CHECK-AIX-64-P9-NEXT:    bl .strlen[PR]
+; CHECK-AIX-64-P9-NEXT:    nop
+; CHECK-AIX-64-P9-NEXT:    addi r1, r1, 128
+; CHECK-AIX-64-P9-NEXT:    ld r0, 16(r1)
+; CHECK-AIX-64-P9-NEXT:    mtlr r0
+; CHECK-AIX-64-P9-NEXT:    blr
+entry:
+  %str.addr = alloca ptr, align 8
+  store ptr %str, ptr %str.addr, align 8
+  %0 = load ptr, ptr %str.addr, align 8
+  %call = call i64 @strlen(ptr noundef %0)
+  ret i64 %call
+}
 
+declare i64 @strlen(ptr noundef) nounwind
diff --git a/llvm/test/CodeGen/PowerPC/nofpclass.ll b/llvm/test/CodeGen/PowerPC/nofpclass.ll
new file mode 100644
index 0000000..b08e810
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/nofpclass.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-ibm-aix-xcoff < %s | FileCheck %s
+
+; TODO: Update this test after adding the proper expansion of nofpclass for
+; ppc_fp128 to test with more masks and to demonstrate preserving nofpclass
+; after legalization.
+
+define ppc_fp128 @f(ppc_fp128 nofpclass(nan) %s) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    blr
+entry:
+  ret ppc_fp128 %s
+}
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
index c733a01..4b03278 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
@@ -30,16 +30,14 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-NEXT:    mflr r0
 ; CHECK-NEXT:    std r0, 16(r1)
 ; CHECK-NEXT:    stw r12, 8(r1)
-; CHECK-NEXT:    stdu r1, -64(r1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset r29, -24
 ; CHECK-NEXT:    .cfi_offset r30, -16
 ; CHECK-NEXT:    .cfi_offset cr2, 8
 ; CHECK-NEXT:    .cfi_offset cr3, 8
 ; CHECK-NEXT:    .cfi_offset cr4, 8
-; CHECK-NEXT:    std r29, 40(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    std r30, 48(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    std r30, 32(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    bl call_2@notoc
 ; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_13
 ; CHECK-NEXT:  # %bb.1: # %bb
@@ -67,11 +65,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-NEXT:    bc 12, 4*cr3+eq, .LBB0_11
 ; CHECK-NEXT:  # %bb.6: # %bb32
 ; CHECK-NEXT:    #
+; CHECK-NEXT:    rlwinm r30, r30, 0, 24, 22
 ; CHECK-NEXT:    andi. r3, r30, 2
-; CHECK-NEXT:    rlwinm r29, r30, 0, 24, 22
 ; CHECK-NEXT:    mcrf cr2, cr0
 ; CHECK-NEXT:    bl call_4@notoc
-; CHECK-NEXT:    mr r30, r29
 ; CHECK-NEXT:    beq+ cr2, .LBB0_3
 ; CHECK-NEXT:  # %bb.7: # %bb37
 ; CHECK-NEXT:  .LBB0_8: # %bb22
@@ -92,13 +89,11 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-BE-NEXT:    stdu r1, -144(r1)
 ; CHECK-BE-NEXT:    .cfi_def_cfa_offset 144
 ; CHECK-BE-NEXT:    .cfi_offset lr, 16
-; CHECK-BE-NEXT:    .cfi_offset r28, -32
 ; CHECK-BE-NEXT:    .cfi_offset r29, -24
 ; CHECK-BE-NEXT:    .cfi_offset r30, -16
 ; CHECK-BE-NEXT:    .cfi_offset cr2, 8
 ; CHECK-BE-NEXT:    .cfi_offset cr2, 8
 ; CHECK-BE-NEXT:    .cfi_offset cr2, 8
-; CHECK-BE-NEXT:    std r28, 112(r1) # 8-byte Folded Spill
 ; CHECK-BE-NEXT:    std r29, 120(r1) # 8-byte Folded Spill
 ; CHECK-BE-NEXT:    std r30, 128(r1) # 8-byte Folded Spill
 ; CHECK-BE-NEXT:    bl call_2
@@ -131,12 +126,11 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-BE-NEXT:    bc 12, 4*cr3+eq, .LBB0_11
 ; CHECK-BE-NEXT:  # %bb.6: # %bb32
 ; CHECK-BE-NEXT:    #
+; CHECK-BE-NEXT:    rlwinm r29, r29, 0, 24, 22
 ; CHECK-BE-NEXT:    andi. r3, r29, 2
-; CHECK-BE-NEXT:    rlwinm r28, r29, 0, 24, 22
 ; CHECK-BE-NEXT:    mcrf cr2, cr0
 ; CHECK-BE-NEXT:    bl call_4
 ; CHECK-BE-NEXT:    nop
-; CHECK-BE-NEXT:    mr r29, r28
 ; CHECK-BE-NEXT:    beq+ cr2, .LBB0_3
 ; CHECK-BE-NEXT:  # %bb.7: # %bb37
 ; CHECK-BE-NEXT:  .LBB0_8: # %bb22
diff --git a/llvm/test/CodeGen/PowerPC/swaps-le-1.ll b/llvm/test/CodeGen/PowerPC/swaps-le-1.ll
index f3e3410..5d5445f 100644
--- a/llvm/test/CodeGen/PowerPC/swaps-le-1.ll
+++ b/llvm/test/CodeGen/PowerPC/swaps-le-1.ll
@@ -187,34 +187,34 @@ define void @foo() {
 ; CHECK-P9-NEXT:    .p2align 4
 ; CHECK-P9-NEXT:  .LBB0_1: # %vector.body
 ; CHECK-P9-NEXT:    #
-; CHECK-P9-NEXT:    lxv 2, -32(6)
-; CHECK-P9-NEXT:    lxv 3, -32(5)
-; CHECK-P9-NEXT:    lxv 4, -16(5)
-; CHECK-P9-NEXT:    vadduwm 2, 3, 2
+; CHECK-P9-NEXT:    lxv 2, -32(3)
 ; CHECK-P9-NEXT:    lxv 3, -32(4)
+; CHECK-P9-NEXT:    lxv 4, -16(4)
+; CHECK-P9-NEXT:    vadduwm 2, 3, 2
+; CHECK-P9-NEXT:    lxv 3, -32(5)
 ; CHECK-P9-NEXT:    vmuluwm 2, 2, 3
-; CHECK-P9-NEXT:    lxv 3, -16(6)
-; CHECK-P9-NEXT:    vadduwm 3, 4, 3
-; CHECK-P9-NEXT:    lxv 4, 0(5)
-; CHECK-P9-NEXT:    stxv 2, -32(3)
-; CHECK-P9-NEXT:    lxv 2, -16(4)
-; CHECK-P9-NEXT:    vmuluwm 2, 3, 2
-; CHECK-P9-NEXT:    lxv 3, 0(6)
+; CHECK-P9-NEXT:    lxv 3, -16(3)
 ; CHECK-P9-NEXT:    vadduwm 3, 4, 3
-; CHECK-P9-NEXT:    lxv 4, 16(5)
-; CHECK-P9-NEXT:    addi 5, 5, 64
-; CHECK-P9-NEXT:    stxv 2, -16(3)
-; CHECK-P9-NEXT:    lxv 2, 0(4)
+; CHECK-P9-NEXT:    lxv 4, 0(4)
+; CHECK-P9-NEXT:    stxv 2, -32(6)
+; CHECK-P9-NEXT:    lxv 2, -16(5)
 ; CHECK-P9-NEXT:    vmuluwm 2, 3, 2
-; CHECK-P9-NEXT:    lxv 3, 16(6)
-; CHECK-P9-NEXT:    addi 6, 6, 64
+; CHECK-P9-NEXT:    lxv 3, 0(3)
 ; CHECK-P9-NEXT:    vadduwm 3, 4, 3
-; CHECK-P9-NEXT:    stxv 2, 0(3)
-; CHECK-P9-NEXT:    lxv 2, 16(4)
+; CHECK-P9-NEXT:    lxv 4, 16(4)
 ; CHECK-P9-NEXT:    addi 4, 4, 64
+; CHECK-P9-NEXT:    stxv 2, -16(6)
+; CHECK-P9-NEXT:    lxv 2, 0(5)
 ; CHECK-P9-NEXT:    vmuluwm 2, 3, 2
-; CHECK-P9-NEXT:    stxv 2, 16(3)
+; CHECK-P9-NEXT:    lxv 3, 16(3)
 ; CHECK-P9-NEXT:    addi 3, 3, 64
+; CHECK-P9-NEXT:    vadduwm 3, 4, 3
+; CHECK-P9-NEXT:    stxv 2, 0(6)
+; CHECK-P9-NEXT:    lxv 2, 16(5)
+; CHECK-P9-NEXT:    addi 5, 5, 64
+; CHECK-P9-NEXT:    vmuluwm 2, 3, 2
+; CHECK-P9-NEXT:    stxv 2, 16(6)
+; CHECK-P9-NEXT:    addi 6, 6, 64
 ; CHECK-P9-NEXT:    bdnz .LBB0_1
 ; CHECK-P9-NEXT:  # %bb.2: # %for.end
 ; CHECK-P9-NEXT:    blr
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-fcmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-fcmp.ll
index dfa76a2..9ec8c32 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/double-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-fcmp.ll
@@ -138,7 +138,7 @@ define i32 @fcmp_olt(double %a, double %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltdf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -148,8 +148,7 @@ define i32 @fcmp_olt(double %a, double %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltdf2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srliw a0, a0, 31
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -446,7 +445,7 @@ define i32 @fcmp_ult(double %a, double %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -456,8 +455,7 @@ define i32 @fcmp_ult(double %a, double %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srliw a0, a0, 31
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-fcmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-fcmp.ll
index 475b67b..380751c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/float-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-fcmp.ll
@@ -138,7 +138,7 @@ define i32 @fcmp_olt(float %a, float %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltsf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -148,8 +148,7 @@ define i32 @fcmp_olt(float %a, float %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltsf2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srliw a0, a0, 31
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -431,7 +430,7 @@ define i32 @fcmp_ult(float %a, float %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -441,8 +440,7 @@ define i32 @fcmp_ult(float %a, float %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srliw a0, a0, 31
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll
index f032756e..c7938a7 100644
--- a/llvm/test/CodeGen/RISCV/alu64.ll
+++ b/llvm/test/CodeGen/RISCV/alu64.ll
@@ -37,7 +37,7 @@ define i64 @slti(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    beqz a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, a1, 0
+; RV32I-NEXT:    srli a0, a1, 31
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB1_2:
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 4efc224..551d886 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -12,7 +12,7 @@ define i1 @sadd(i32 %a, i32 %b, ptr %c) nounwind {
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    add a3, a0, a1
 ; RV32I-NEXT:    slt a0, a3, a0
-; RV32I-NEXT:    slti a1, a1, 0
+; RV32I-NEXT:    srli a1, a1, 31
 ; RV32I-NEXT:    xor a0, a1, a0
 ; RV32I-NEXT:    sw a3, 0(a2)
 ; RV32I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll
index fa6892b..95c577f 100644
--- a/llvm/test/CodeGen/RISCV/bittest.ll
+++ b/llvm/test/CodeGen/RISCV/bittest.ll
@@ -187,14 +187,14 @@ define i64 @bittest_31_i64(i64 %a) nounwind {
 ;
 ; RV64ZBS-LABEL: bittest_31_i64:
 ; RV64ZBS:       # %bb.0:
-; RV64ZBS-NEXT:    not a0, a0
-; RV64ZBS-NEXT:    bexti a0, a0, 31
+; RV64ZBS-NEXT:    srliw a0, a0, 31
+; RV64ZBS-NEXT:    xori a0, a0, 1
 ; RV64ZBS-NEXT:    ret
 ;
 ; RV64XTHEADBS-LABEL: bittest_31_i64:
 ; RV64XTHEADBS:       # %bb.0:
-; RV64XTHEADBS-NEXT:    not a0, a0
-; RV64XTHEADBS-NEXT:    th.tst a0, a0, 31
+; RV64XTHEADBS-NEXT:    srliw a0, a0, 31
+; RV64XTHEADBS-NEXT:    xori a0, a0, 1
 ; RV64XTHEADBS-NEXT:    ret
   %shr = lshr i64 %a, 31
   %not = xor i64 %shr, -1
@@ -3507,3 +3507,77 @@ define void @bit_64_1_nz_branch_i64(i64 %0) {
 5:
   ret void
 }
+
+define i32 @bittest_31_andeq0_i64(i64 %x) {
+; RV32-LABEL: bittest_31_andeq0_i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bittest_31_andeq0_i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    srliw a0, a0, 31
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    ret
+  %and = and i64 %x, 2147483648 ; 2147483648 is 1 << 31, so only bit 31 of %x survives the mask
+  %cmp = icmp eq i64 %and, 0 ; true when bit 31 of %x is clear
+  %conv = zext i1 %cmp to i32 ; widen the i1 to 0 or 1; the checks above expect a shift right by 31 then xori 1
+  ret i32 %conv
+}
+
+define i32 @bittest_63_andeq0_i64(i64 %x) {
+; RV32-LABEL: bittest_63_andeq0_i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    xori a0, a1, 1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bittest_63_andeq0_i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    xori a0, a0, 1
+; RV64-NEXT:    ret
+  %and = and i64 %x, 9223372036854775808 ; 9223372036854775808 is 1 << 63, so only the top bit of %x survives the mask
+  %cmp = icmp eq i64 %and, 0 ; true when bit 63 of %x is clear
+  %conv = zext i1 %cmp to i32 ; widen the i1 to 0 or 1; the checks above expect a shift right to bit 0 then xori 1
+  ret i32 %conv
+}
+
+define i32 @bittest_31_slt0_i32(i32 %x, i1 %y) {
+; RV32-LABEL: bittest_31_slt0_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bittest_31_slt0_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    srliw a0, a0, 31
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    ret
+  %cmp = icmp slt i32 %x, 0 ; signed less-than-zero, i.e. true when the sign bit (bit 31) of %x is set
+  %and = and i1 %cmp, %y ; combine the sign test with the i1 argument
+  %ext = zext i1 %and to i32 ; widen to 0 or 1; the checks above expect a shift right by 31 feeding a plain and
+  ret i32 %ext
+}
+
+define i32 @bittest_63_slt0_i64(i32 %x, i1 %y) {
+; RV32-LABEL: bittest_63_slt0_i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bittest_63_slt0_i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    srliw a0, a0, 31
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    ret
+  %ext = sext i32 %x to i64 ; the argument is i32 despite the _i64 name; sign extension copies bit 31 of %x into bit 63
+  %cmp = icmp slt i64 %ext, 0 ; the i64 sign test is therefore true exactly when %x is negative
+  %and = and i1 %cmp, %y ; combine the sign test with the i1 argument
+  %cond = zext i1 %and to i32 ; widen the i1 result to 0 or 1
+  ret i32 %cond
+}
diff --git a/llvm/test/CodeGen/RISCV/condbinops.ll b/llvm/test/CodeGen/RISCV/condbinops.ll
index dc81c13..91052bc 100644
--- a/llvm/test/CodeGen/RISCV/condbinops.ll
+++ b/llvm/test/CodeGen/RISCV/condbinops.ll
@@ -459,7 +459,7 @@ define i64 @shl64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
 ; RV32ZICOND-NEXT:    sll a1, a1, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    srli a4, a4, 31
 ; RV32ZICOND-NEXT:    srl a2, a3, a2
 ; RV32ZICOND-NEXT:    czero.nez a3, a0, a4
 ; RV32ZICOND-NEXT:    or a1, a1, a2
@@ -534,7 +534,7 @@ define i64 @ashr64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
 ; RV32ZICOND-NEXT:    srl a0, a0, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    srli a4, a4, 31
 ; RV32ZICOND-NEXT:    sll a2, a3, a2
 ; RV32ZICOND-NEXT:    czero.nez a3, a1, a4
 ; RV32ZICOND-NEXT:    or a0, a0, a2
@@ -610,7 +610,7 @@ define i64 @lshr64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
 ; RV32ZICOND-NEXT:    srl a0, a0, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    srli a4, a4, 31
 ; RV32ZICOND-NEXT:    sll a2, a3, a2
 ; RV32ZICOND-NEXT:    czero.nez a3, a1, a4
 ; RV32ZICOND-NEXT:    or a0, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index a2e6186..9c81bc28 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -405,7 +405,7 @@ define i32 @fcvt_wu_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    li a2, 0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s3, a0, -1
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    mv a1, s0
@@ -446,8 +446,8 @@ define i32 @fcvt_wu_d_sat(double %a) nounwind {
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB6_3: # %start
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -819,7 +819,7 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    call __unorddf2
 ; RV32I-NEXT:    snez a0, a0
-; RV32I-NEXT:    slti a1, s4, 0
+; RV32I-NEXT:    srli a1, s4, 31
 ; RV32I-NEXT:    sgtz a2, s2
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    addi a3, a1, -1
@@ -1029,7 +1029,7 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    li a2, 0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s3, a0, -1
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    mv a1, s0
@@ -1055,7 +1055,7 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    addi s1, a0, -1
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __fixunsdfdi
@@ -1898,9 +1898,9 @@ define zeroext i16 @fcvt_wu_s_sat_i16(double %a) nounwind {
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB28_3
 ; RV32I-NEXT:  .LBB28_2:
-; RV32I-NEXT:    slti a2, s0, 0
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a0, a2, a0
+; RV32I-NEXT:    srli s0, s0, 31
+; RV32I-NEXT:    addi s0, s0, -1
+; RV32I-NEXT:    and a0, s0, a0
 ; RV32I-NEXT:  .LBB28_3: # %start
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -1937,8 +1937,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(double %a) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB28_3
 ; RV64I-NEXT:  .LBB28_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB28_3: # %start
 ; RV64I-NEXT:    and a0, a0, a1
@@ -2271,9 +2271,9 @@ define zeroext i8 @fcvt_wu_s_sat_i8(double %a) nounwind {
 ; RV32I-NEXT:    li a0, 255
 ; RV32I-NEXT:    j .LBB32_3
 ; RV32I-NEXT:  .LBB32_2:
-; RV32I-NEXT:    slti a1, s0, 0
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    srli s0, s0, 31
+; RV32I-NEXT:    addi s0, s0, -1
+; RV32I-NEXT:    and a0, s0, a0
 ; RV32I-NEXT:  .LBB32_3: # %start
 ; RV32I-NEXT:    zext.b a0, a0
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -2307,8 +2307,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(double %a) nounwind {
 ; RV64I-NEXT:    li a0, 255
 ; RV64I-NEXT:    j .LBB32_3
 ; RV64I-NEXT:  .LBB32_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB32_3: # %start
 ; RV64I-NEXT:    zext.b a0, a0
@@ -2386,7 +2386,7 @@ define zeroext i32 @fcvt_wu_d_sat_zext(double %a) nounwind {
 ; RV32I-NEXT:    li a2, 0
 ; RV32I-NEXT:    li a3, 0
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s3, a0, -1
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    mv a1, s0
@@ -2427,8 +2427,8 @@ define zeroext i32 @fcvt_wu_d_sat_zext(double %a) nounwind {
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    j .LBB33_3
 ; RV64I-NEXT:  .LBB33_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB33_3: # %start
 ; RV64I-NEXT:    slli a0, a0, 32
diff --git a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
index 7c5332f..b1c63af 100644
--- a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll
@@ -140,7 +140,7 @@ define i32 @fcmp_oge(double %a, double %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -151,7 +151,7 @@ define i32 @fcmp_oge(double %a, double %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -193,7 +193,7 @@ define i32 @fcmp_olt(double %a, double %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltdf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -203,7 +203,7 @@ define i32 @fcmp_olt(double %a, double %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltdf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -605,7 +605,7 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltdf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -616,7 +616,7 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltdf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -661,7 +661,7 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -671,7 +671,7 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -934,7 +934,7 @@ define i32 @fcmps_oge(double %a, double %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -945,7 +945,7 @@ define i32 @fcmps_oge(double %a, double %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -976,7 +976,7 @@ define i32 @fcmps_olt(double %a, double %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltdf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -986,7 +986,7 @@ define i32 @fcmps_olt(double %a, double %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltdf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -1311,7 +1311,7 @@ define i32 @fcmps_uge(double %a, double %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltdf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -1322,7 +1322,7 @@ define i32 @fcmps_uge(double %a, double %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltdf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -1356,7 +1356,7 @@ define i32 @fcmps_ult(double %a, double %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1366,7 +1366,7 @@ define i32 @fcmps_ult(double %a, double %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/double-fcmp.ll b/llvm/test/CodeGen/RISCV/double-fcmp.ll
index f73e686..31c8589 100644
--- a/llvm/test/CodeGen/RISCV/double-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/double-fcmp.ll
@@ -138,7 +138,7 @@ define i32 @fcmp_oge(double %a, double %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -149,7 +149,7 @@ define i32 @fcmp_oge(double %a, double %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -180,7 +180,7 @@ define i32 @fcmp_olt(double %a, double %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltdf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -190,7 +190,7 @@ define i32 @fcmp_olt(double %a, double %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltdf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -515,7 +515,7 @@ define i32 @fcmp_uge(double %a, double %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltdf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -526,7 +526,7 @@ define i32 @fcmp_uge(double %a, double %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltdf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -560,7 +560,7 @@ define i32 @fcmp_ult(double %a, double %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -570,7 +570,7 @@ define i32 @fcmp_ult(double %a, double %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 60349a0..6e49d47 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -278,7 +278,7 @@ define i32 @fcvt_wu_s_sat(float %a) nounwind {
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s1, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfsi
@@ -320,8 +320,8 @@ define i32 @fcvt_wu_s_sat(float %a) nounwind {
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    j .LBB4_3
 ; RV64I-NEXT:  .LBB4_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB4_3: # %start
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -736,7 +736,7 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    call __unordsf2
 ; RV32I-NEXT:    snez a0, a0
-; RV32I-NEXT:    slti a1, s2, 0
+; RV32I-NEXT:    srli a1, s2, 31
 ; RV32I-NEXT:    sgtz a2, s4
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    addi a3, a1, -1
@@ -932,7 +932,7 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s2, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfdi
@@ -971,7 +971,7 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    addi s2, a0, -1
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __fixunssfdi
@@ -1651,8 +1651,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(float %a) nounwind {
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB26_3
 ; RV32I-NEXT:  .LBB26_2:
-; RV32I-NEXT:    slti a0, s0, 0
-; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    srli s0, s0, 31
+; RV32I-NEXT:    addi a0, s0, -1
 ; RV32I-NEXT:    and a0, a0, s1
 ; RV32I-NEXT:  .LBB26_3: # %start
 ; RV32I-NEXT:    and a0, a0, a1
@@ -1688,8 +1688,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(float %a) nounwind {
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB26_3
 ; RV64I-NEXT:  .LBB26_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB26_3: # %start
 ; RV64I-NEXT:    and a0, a0, a1
@@ -1986,8 +1986,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(float %a) nounwind {
 ; RV32I-NEXT:    li a0, 255
 ; RV32I-NEXT:    j .LBB30_3
 ; RV32I-NEXT:  .LBB30_2:
-; RV32I-NEXT:    slti a0, s0, 0
-; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    srli s0, s0, 31
+; RV32I-NEXT:    addi a0, s0, -1
 ; RV32I-NEXT:    and a0, a0, s1
 ; RV32I-NEXT:  .LBB30_3: # %start
 ; RV32I-NEXT:    zext.b a0, a0
@@ -2020,8 +2020,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(float %a) nounwind {
 ; RV64I-NEXT:    li a0, 255
 ; RV64I-NEXT:    j .LBB30_3
 ; RV64I-NEXT:  .LBB30_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB30_3: # %start
 ; RV64I-NEXT:    zext.b a0, a0
@@ -2087,7 +2087,7 @@ define zeroext i32 @fcvt_wu_s_sat_zext(float %a) nounwind {
 ; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s1, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfsi
@@ -2129,8 +2129,8 @@ define zeroext i32 @fcvt_wu_s_sat_zext(float %a) nounwind {
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    j .LBB31_3
 ; RV64I-NEXT:  .LBB31_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB31_3: # %start
 ; RV64I-NEXT:    slli a0, a0, 32
diff --git a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
index fd3baa0..7cdd182 100644
--- a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll
@@ -117,7 +117,7 @@ define i32 @fcmp_oge(float %a, float %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -128,7 +128,7 @@ define i32 @fcmp_oge(float %a, float %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -161,7 +161,7 @@ define i32 @fcmp_olt(float %a, float %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltsf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -171,7 +171,7 @@ define i32 @fcmp_olt(float %a, float %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltsf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -492,7 +492,7 @@ define i32 @fcmp_uge(float %a, float %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltsf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -503,7 +503,7 @@ define i32 @fcmp_uge(float %a, float %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltsf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -538,7 +538,7 @@ define i32 @fcmp_ult(float %a, float %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -548,7 +548,7 @@ define i32 @fcmp_ult(float %a, float %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -770,7 +770,7 @@ define i32 @fcmps_oge(float %a, float %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -781,7 +781,7 @@ define i32 @fcmps_oge(float %a, float %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -807,7 +807,7 @@ define i32 @fcmps_olt(float %a, float %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltsf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -817,7 +817,7 @@ define i32 @fcmps_olt(float %a, float %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltsf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -1087,7 +1087,7 @@ define i32 @fcmps_uge(float %a, float %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltsf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -1098,7 +1098,7 @@ define i32 @fcmps_uge(float %a, float %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltsf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -1126,7 +1126,7 @@ define i32 @fcmps_ult(float %a, float %b) nounwind strictfp {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1136,7 +1136,7 @@ define i32 @fcmps_ult(float %a, float %b) nounwind strictfp {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/float-fcmp.ll b/llvm/test/CodeGen/RISCV/float-fcmp.ll
index 2e9c39f..cec6580 100644
--- a/llvm/test/CodeGen/RISCV/float-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/float-fcmp.ll
@@ -123,7 +123,7 @@ define i32 @fcmp_oge(float %a, float %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -134,7 +134,7 @@ define i32 @fcmp_oge(float %a, float %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -160,7 +160,7 @@ define i32 @fcmp_olt(float %a, float %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltsf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -170,7 +170,7 @@ define i32 @fcmp_olt(float %a, float %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltsf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
@@ -440,7 +440,7 @@ define i32 @fcmp_uge(float %a, float %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __ltsf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
@@ -451,7 +451,7 @@ define i32 @fcmp_uge(float %a, float %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __ltsf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -479,7 +479,7 @@ define i32 @fcmp_ult(float %a, float %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -489,7 +489,7 @@ define i32 @fcmp_ult(float %a, float %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/float-intrinsics.ll b/llvm/test/CodeGen/RISCV/float-intrinsics.ll
index ed50042..8b8a325 100644
--- a/llvm/test/CodeGen/RISCV/float-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/float-intrinsics.ll
@@ -1634,7 +1634,7 @@ define i1 @fpclass(float %x) {
 ; RV32I-NEXT:    add a4, a5, a4
 ; RV32I-NEXT:    addi a5, a5, -1
 ; RV32I-NEXT:    sltu a2, a5, a2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    seqz a5, a6
 ; RV32I-NEXT:    srli a4, a4, 24
@@ -1660,8 +1660,7 @@ define i1 @fpclass(float %x) {
 ; RV64I-NEXT:    add a4, a5, a4
 ; RV64I-NEXT:    addi a5, a5, -1
 ; RV64I-NEXT:    sltu a2, a5, a2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srliw a0, a0, 31
 ; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    seqz a5, a6
 ; RV64I-NEXT:    srliw a4, a4, 24
@@ -2092,19 +2091,18 @@ define i1 @isnegfinite_fpclass(float %x) {
 ; RV32I-NEXT:    lui a2, 522240
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    slt a1, a1, a2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    and a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: isnegfinite_fpclass:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sext.w a1, a0
-; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    slli a1, a0, 33
 ; RV64I-NEXT:    lui a2, 522240
-; RV64I-NEXT:    srli a0, a0, 33
-; RV64I-NEXT:    slt a0, a0, a2
-; RV64I-NEXT:    slti a1, a1, 0
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a1, a1, 33
+; RV64I-NEXT:    slt a1, a1, a2
+; RV64I-NEXT:    srliw a0, a0, 31
+; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 56)  ; 0x38 = "-finite"
   ret i1 %1
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index 477a7d1..aa65ebe 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -909,7 +909,7 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV32I-NEXT:    mv s2, a2
 ; RV32I-NEXT:    beqz a3, .LBB20_3
 ; RV32I-NEXT:  # %bb.1: # %entry
-; RV32I-NEXT:    slti a1, s1, 0
+; RV32I-NEXT:    srli a1, s1, 31
 ; RV32I-NEXT:    beqz a1, .LBB20_4
 ; RV32I-NEXT:  .LBB20_2:
 ; RV32I-NEXT:    li s3, 0
@@ -974,7 +974,7 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV32I-MEDIUM-NEXT:    mv s2, a2
 ; RV32I-MEDIUM-NEXT:    beqz a3, .LBB20_3
 ; RV32I-MEDIUM-NEXT:  # %bb.1: # %entry
-; RV32I-MEDIUM-NEXT:    slti a1, s1, 0
+; RV32I-MEDIUM-NEXT:    srli a1, s1, 31
 ; RV32I-MEDIUM-NEXT:    beqz a1, .LBB20_4
 ; RV32I-MEDIUM-NEXT:  .LBB20_2:
 ; RV32I-MEDIUM-NEXT:    li s3, 0
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index e7719dc..1a69106 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -3475,7 +3475,7 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    beqz a1, .LBB50_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB50_2 Depth=1
-; RV32-NEXT:    slti a0, a1, 0
+; RV32-NEXT:    srli a0, a1, 31
 ; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    bnez a0, .LBB50_1
 ; RV32-NEXT:    j .LBB50_5
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 519f1e8..18d071c 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -22,7 +22,7 @@ define i32 @stest_f64i32(double %x) {
 ; RV32IF-NEXT:    addi a3, a2, -1
 ; RV32IF-NEXT:    beqz a1, .LBB0_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a4, a1, 0
+; RV32IF-NEXT:    srli a4, a1, 31
 ; RV32IF-NEXT:    j .LBB0_3
 ; RV32IF-NEXT:  .LBB0_2:
 ; RV32IF-NEXT:    sltu a4, a0, a3
@@ -36,7 +36,7 @@ define i32 @stest_f64i32(double %x) {
 ; RV32IF-NEXT:    li a3, -1
 ; RV32IF-NEXT:    beq a1, a3, .LBB0_7
 ; RV32IF-NEXT:  # %bb.6: # %entry
-; RV32IF-NEXT:    slti a1, a1, 0
+; RV32IF-NEXT:    srli a1, a1, 31
 ; RV32IF-NEXT:    xori a1, a1, 1
 ; RV32IF-NEXT:    beqz a1, .LBB0_8
 ; RV32IF-NEXT:    j .LBB0_9
@@ -185,7 +185,7 @@ define i32 @ustest_f64i32(double %x) {
 ; RV32IF-NEXT:    call __fixdfdi
 ; RV32IF-NEXT:    beqz a1, .LBB2_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a2, a1, 0
+; RV32IF-NEXT:    srli a2, a1, 31
 ; RV32IF-NEXT:    j .LBB2_3
 ; RV32IF-NEXT:  .LBB2_2:
 ; RV32IF-NEXT:    sltiu a2, a0, -1
@@ -373,7 +373,7 @@ define i32 @stest_f16i32(half %x) {
 ; RV32-NEXT:    addi a3, a2, -1
 ; RV32-NEXT:    beqz a1, .LBB6_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a1, 0
+; RV32-NEXT:    srli a4, a1, 31
 ; RV32-NEXT:    j .LBB6_3
 ; RV32-NEXT:  .LBB6_2:
 ; RV32-NEXT:    sltu a4, a0, a3
@@ -387,7 +387,7 @@ define i32 @stest_f16i32(half %x) {
 ; RV32-NEXT:    li a3, -1
 ; RV32-NEXT:    beq a1, a3, .LBB6_7
 ; RV32-NEXT:  # %bb.6: # %entry
-; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    xori a1, a1, 1
 ; RV32-NEXT:    beqz a1, .LBB6_8
 ; RV32-NEXT:    j .LBB6_9
@@ -494,7 +494,7 @@ define i32 @ustest_f16i32(half %x) {
 ; RV32-NEXT:    call __fixsfdi
 ; RV32-NEXT:    beqz a1, .LBB8_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a2, a1, 0
+; RV32-NEXT:    srli a2, a1, 31
 ; RV32-NEXT:    j .LBB8_3
 ; RV32-NEXT:  .LBB8_2:
 ; RV32-NEXT:    sltiu a2, a0, -1
@@ -1108,7 +1108,7 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IF-NEXT:    or a7, a2, a4
 ; RV32IF-NEXT:    beqz a7, .LBB18_4
 ; RV32IF-NEXT:  .LBB18_3: # %entry
-; RV32IF-NEXT:    slti a6, a4, 0
+; RV32IF-NEXT:    srli a6, a4, 31
 ; RV32IF-NEXT:  .LBB18_4: # %entry
 ; RV32IF-NEXT:    neg a7, a6
 ; RV32IF-NEXT:    addi t0, a6, -1
@@ -1130,8 +1130,8 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IF-NEXT:    li a5, -1
 ; RV32IF-NEXT:    beq a2, a5, .LBB18_11
 ; RV32IF-NEXT:  # %bb.10: # %entry
-; RV32IF-NEXT:    slti a0, a4, 0
-; RV32IF-NEXT:    xori a0, a0, 1
+; RV32IF-NEXT:    srli a4, a4, 31
+; RV32IF-NEXT:    xori a0, a4, 1
 ; RV32IF-NEXT:  .LBB18_11: # %entry
 ; RV32IF-NEXT:    bnez a0, .LBB18_13
 ; RV32IF-NEXT:  # %bb.12: # %entry
@@ -1156,7 +1156,7 @@ define i64 @stest_f64i64(double %x) {
 ; RV64IF-NEXT:    srli a3, a2, 1
 ; RV64IF-NEXT:    beqz a1, .LBB18_2
 ; RV64IF-NEXT:  # %bb.1: # %entry
-; RV64IF-NEXT:    slti a4, a1, 0
+; RV64IF-NEXT:    srli a4, a1, 63
 ; RV64IF-NEXT:    j .LBB18_3
 ; RV64IF-NEXT:  .LBB18_2:
 ; RV64IF-NEXT:    sltu a4, a0, a3
@@ -1170,8 +1170,8 @@ define i64 @stest_f64i64(double %x) {
 ; RV64IF-NEXT:    slli a1, a2, 63
 ; RV64IF-NEXT:    beq a5, a2, .LBB18_7
 ; RV64IF-NEXT:  # %bb.6: # %entry
-; RV64IF-NEXT:    slti a2, a5, 0
-; RV64IF-NEXT:    xori a2, a2, 1
+; RV64IF-NEXT:    srli a5, a5, 63
+; RV64IF-NEXT:    xori a2, a5, 1
 ; RV64IF-NEXT:    beqz a2, .LBB18_8
 ; RV64IF-NEXT:    j .LBB18_9
 ; RV64IF-NEXT:  .LBB18_7:
@@ -1211,7 +1211,7 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IFD-NEXT:    or a7, a2, a4
 ; RV32IFD-NEXT:    beqz a7, .LBB18_4
 ; RV32IFD-NEXT:  .LBB18_3: # %entry
-; RV32IFD-NEXT:    slti a6, a4, 0
+; RV32IFD-NEXT:    srli a6, a4, 31
 ; RV32IFD-NEXT:  .LBB18_4: # %entry
 ; RV32IFD-NEXT:    neg a7, a6
 ; RV32IFD-NEXT:    addi t0, a6, -1
@@ -1233,8 +1233,8 @@ define i64 @stest_f64i64(double %x) {
 ; RV32IFD-NEXT:    li a5, -1
 ; RV32IFD-NEXT:    beq a2, a5, .LBB18_11
 ; RV32IFD-NEXT:  # %bb.10: # %entry
-; RV32IFD-NEXT:    slti a0, a4, 0
-; RV32IFD-NEXT:    xori a0, a0, 1
+; RV32IFD-NEXT:    srli a4, a4, 31
+; RV32IFD-NEXT:    xori a0, a4, 1
 ; RV32IFD-NEXT:  .LBB18_11: # %entry
 ; RV32IFD-NEXT:    bnez a0, .LBB18_13
 ; RV32IFD-NEXT:  # %bb.12: # %entry
@@ -1363,7 +1363,7 @@ define i64 @ustest_f64i64(double %x) {
 ; RV32IF-NEXT:    lw a0, 16(sp)
 ; RV32IF-NEXT:    beqz a1, .LBB20_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a2, a1, 0
+; RV32IF-NEXT:    srli a2, a1, 31
 ; RV32IF-NEXT:    j .LBB20_3
 ; RV32IF-NEXT:  .LBB20_2:
 ; RV32IF-NEXT:    seqz a2, a0
@@ -1446,7 +1446,7 @@ define i64 @ustest_f64i64(double %x) {
 ; RV32IFD-NEXT:    lw a0, 16(sp)
 ; RV32IFD-NEXT:    beqz a1, .LBB20_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
-; RV32IFD-NEXT:    slti a2, a1, 0
+; RV32IFD-NEXT:    srli a2, a1, 31
 ; RV32IFD-NEXT:    j .LBB20_3
 ; RV32IFD-NEXT:  .LBB20_2:
 ; RV32IFD-NEXT:    seqz a2, a0
@@ -1523,7 +1523,7 @@ define i64 @stest_f32i64(float %x) {
 ; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    beqz a7, .LBB21_4
 ; RV32-NEXT:  .LBB21_3: # %entry
-; RV32-NEXT:    slti a6, a4, 0
+; RV32-NEXT:    srli a6, a4, 31
 ; RV32-NEXT:  .LBB21_4: # %entry
 ; RV32-NEXT:    neg a7, a6
 ; RV32-NEXT:    addi t0, a6, -1
@@ -1545,8 +1545,8 @@ define i64 @stest_f32i64(float %x) {
 ; RV32-NEXT:    li a5, -1
 ; RV32-NEXT:    beq a2, a5, .LBB21_11
 ; RV32-NEXT:  # %bb.10: # %entry
-; RV32-NEXT:    slti a0, a4, 0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    xori a0, a4, 1
 ; RV32-NEXT:  .LBB21_11: # %entry
 ; RV32-NEXT:    bnez a0, .LBB21_13
 ; RV32-NEXT:  # %bb.12: # %entry
@@ -1643,7 +1643,7 @@ define i64 @ustest_f32i64(float %x) {
 ; RV32-NEXT:    lw a0, 16(sp)
 ; RV32-NEXT:    beqz a1, .LBB23_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a2, a1, 0
+; RV32-NEXT:    srli a2, a1, 31
 ; RV32-NEXT:    j .LBB23_3
 ; RV32-NEXT:  .LBB23_2:
 ; RV32-NEXT:    seqz a2, a0
@@ -1750,7 +1750,7 @@ define i64 @stest_f16i64(half %x) {
 ; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    beqz a7, .LBB24_4
 ; RV32-NEXT:  .LBB24_3: # %entry
-; RV32-NEXT:    slti a6, a4, 0
+; RV32-NEXT:    srli a6, a4, 31
 ; RV32-NEXT:  .LBB24_4: # %entry
 ; RV32-NEXT:    neg a7, a6
 ; RV32-NEXT:    addi t0, a6, -1
@@ -1772,8 +1772,8 @@ define i64 @stest_f16i64(half %x) {
 ; RV32-NEXT:    li a5, -1
 ; RV32-NEXT:    beq a2, a5, .LBB24_11
 ; RV32-NEXT:  # %bb.10: # %entry
-; RV32-NEXT:    slti a0, a4, 0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    xori a0, a4, 1
 ; RV32-NEXT:  .LBB24_11: # %entry
 ; RV32-NEXT:    bnez a0, .LBB24_13
 ; RV32-NEXT:  # %bb.12: # %entry
@@ -1799,7 +1799,7 @@ define i64 @stest_f16i64(half %x) {
 ; RV64-NEXT:    srli a3, a2, 1
 ; RV64-NEXT:    beqz a1, .LBB24_2
 ; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    slti a4, a1, 0
+; RV64-NEXT:    srli a4, a1, 63
 ; RV64-NEXT:    j .LBB24_3
 ; RV64-NEXT:  .LBB24_2:
 ; RV64-NEXT:    sltu a4, a0, a3
@@ -1813,8 +1813,8 @@ define i64 @stest_f16i64(half %x) {
 ; RV64-NEXT:    slli a1, a2, 63
 ; RV64-NEXT:    beq a5, a2, .LBB24_7
 ; RV64-NEXT:  # %bb.6: # %entry
-; RV64-NEXT:    slti a2, a5, 0
-; RV64-NEXT:    xori a2, a2, 1
+; RV64-NEXT:    srli a5, a5, 63
+; RV64-NEXT:    xori a2, a5, 1
 ; RV64-NEXT:    beqz a2, .LBB24_8
 ; RV64-NEXT:    j .LBB24_9
 ; RV64-NEXT:  .LBB24_7:
@@ -1906,7 +1906,7 @@ define i64 @ustest_f16i64(half %x) {
 ; RV32-NEXT:    lw a0, 16(sp)
 ; RV32-NEXT:    beqz a1, .LBB26_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a2, a1, 0
+; RV32-NEXT:    srli a2, a1, 31
 ; RV32-NEXT:    j .LBB26_3
 ; RV32-NEXT:  .LBB26_2:
 ; RV32-NEXT:    seqz a2, a0
@@ -2004,7 +2004,7 @@ define i32 @stest_f64i32_mm(double %x) {
 ; RV32IF-NEXT:    addi a3, a2, -1
 ; RV32IF-NEXT:    beqz a1, .LBB27_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a4, a1, 0
+; RV32IF-NEXT:    srli a4, a1, 31
 ; RV32IF-NEXT:    j .LBB27_3
 ; RV32IF-NEXT:  .LBB27_2:
 ; RV32IF-NEXT:    sltu a4, a0, a3
@@ -2018,7 +2018,7 @@ define i32 @stest_f64i32_mm(double %x) {
 ; RV32IF-NEXT:    li a3, -1
 ; RV32IF-NEXT:    beq a1, a3, .LBB27_7
 ; RV32IF-NEXT:  # %bb.6: # %entry
-; RV32IF-NEXT:    slti a1, a1, 0
+; RV32IF-NEXT:    srli a1, a1, 31
 ; RV32IF-NEXT:    xori a1, a1, 1
 ; RV32IF-NEXT:    beqz a1, .LBB27_8
 ; RV32IF-NEXT:    j .LBB27_9
@@ -2171,7 +2171,7 @@ define i32 @ustest_f64i32_mm(double %x) {
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a3, a0
 ; RV32IF-NEXT:    and a1, a2, a1
-; RV32IF-NEXT:    slti a1, a1, 0
+; RV32IF-NEXT:    srli a1, a1, 31
 ; RV32IF-NEXT:    addi a1, a1, -1
 ; RV32IF-NEXT:    and a0, a1, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2337,7 +2337,7 @@ define i32 @stest_f16i32_mm(half %x) {
 ; RV32-NEXT:    addi a3, a2, -1
 ; RV32-NEXT:    beqz a1, .LBB33_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a1, 0
+; RV32-NEXT:    srli a4, a1, 31
 ; RV32-NEXT:    j .LBB33_3
 ; RV32-NEXT:  .LBB33_2:
 ; RV32-NEXT:    sltu a4, a0, a3
@@ -2351,7 +2351,7 @@ define i32 @stest_f16i32_mm(half %x) {
 ; RV32-NEXT:    li a3, -1
 ; RV32-NEXT:    beq a1, a3, .LBB33_7
 ; RV32-NEXT:  # %bb.6: # %entry
-; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    xori a1, a1, 1
 ; RV32-NEXT:    beqz a1, .LBB33_8
 ; RV32-NEXT:    j .LBB33_9
@@ -2462,7 +2462,7 @@ define i32 @ustest_f16i32_mm(half %x) {
 ; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    or a0, a3, a0
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -3044,7 +3044,7 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    or a7, a2, a4
 ; RV32IF-NEXT:    beqz a7, .LBB45_4
 ; RV32IF-NEXT:  .LBB45_3: # %entry
-; RV32IF-NEXT:    slti a6, a4, 0
+; RV32IF-NEXT:    srli a6, a4, 31
 ; RV32IF-NEXT:  .LBB45_4: # %entry
 ; RV32IF-NEXT:    neg a7, a6
 ; RV32IF-NEXT:    addi t0, a6, -1
@@ -3066,8 +3066,8 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    li a5, -1
 ; RV32IF-NEXT:    beq a2, a5, .LBB45_11
 ; RV32IF-NEXT:  # %bb.10: # %entry
-; RV32IF-NEXT:    slti a0, a4, 0
-; RV32IF-NEXT:    xori a0, a0, 1
+; RV32IF-NEXT:    srli a4, a4, 31
+; RV32IF-NEXT:    xori a0, a4, 1
 ; RV32IF-NEXT:  .LBB45_11: # %entry
 ; RV32IF-NEXT:    bnez a0, .LBB45_13
 ; RV32IF-NEXT:  # %bb.12: # %entry
@@ -3092,7 +3092,7 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV64IF-NEXT:    srli a3, a2, 1
 ; RV64IF-NEXT:    beqz a1, .LBB45_2
 ; RV64IF-NEXT:  # %bb.1: # %entry
-; RV64IF-NEXT:    slti a4, a1, 0
+; RV64IF-NEXT:    srli a4, a1, 63
 ; RV64IF-NEXT:    j .LBB45_3
 ; RV64IF-NEXT:  .LBB45_2:
 ; RV64IF-NEXT:    sltu a4, a0, a3
@@ -3106,8 +3106,8 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV64IF-NEXT:    slli a1, a2, 63
 ; RV64IF-NEXT:    beq a5, a2, .LBB45_7
 ; RV64IF-NEXT:  # %bb.6: # %entry
-; RV64IF-NEXT:    slti a2, a5, 0
-; RV64IF-NEXT:    xori a2, a2, 1
+; RV64IF-NEXT:    srli a5, a5, 63
+; RV64IF-NEXT:    xori a2, a5, 1
 ; RV64IF-NEXT:    beqz a2, .LBB45_8
 ; RV64IF-NEXT:    j .LBB45_9
 ; RV64IF-NEXT:  .LBB45_7:
@@ -3147,7 +3147,7 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    or a7, a2, a4
 ; RV32IFD-NEXT:    beqz a7, .LBB45_4
 ; RV32IFD-NEXT:  .LBB45_3: # %entry
-; RV32IFD-NEXT:    slti a6, a4, 0
+; RV32IFD-NEXT:    srli a6, a4, 31
 ; RV32IFD-NEXT:  .LBB45_4: # %entry
 ; RV32IFD-NEXT:    neg a7, a6
 ; RV32IFD-NEXT:    addi t0, a6, -1
@@ -3169,8 +3169,8 @@ define i64 @stest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    li a5, -1
 ; RV32IFD-NEXT:    beq a2, a5, .LBB45_11
 ; RV32IFD-NEXT:  # %bb.10: # %entry
-; RV32IFD-NEXT:    slti a0, a4, 0
-; RV32IFD-NEXT:    xori a0, a0, 1
+; RV32IFD-NEXT:    srli a4, a4, 31
+; RV32IFD-NEXT:    xori a0, a4, 1
 ; RV32IFD-NEXT:  .LBB45_11: # %entry
 ; RV32IFD-NEXT:    bnez a0, .LBB45_13
 ; RV32IFD-NEXT:  # %bb.12: # %entry
@@ -3298,7 +3298,7 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    lw a3, 16(sp)
 ; RV32IF-NEXT:    beqz a2, .LBB47_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a4, a2, 0
+; RV32IF-NEXT:    srli a4, a2, 31
 ; RV32IF-NEXT:    j .LBB47_3
 ; RV32IF-NEXT:  .LBB47_2:
 ; RV32IF-NEXT:    seqz a4, a3
@@ -3312,7 +3312,7 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    and a1, a3, a1
 ; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    and a2, a3, a2
-; RV32IF-NEXT:    slti a2, a2, 0
+; RV32IF-NEXT:    srli a2, a2, 31
 ; RV32IF-NEXT:    addi a2, a2, -1
 ; RV32IF-NEXT:    and a0, a2, a0
 ; RV32IF-NEXT:    and a1, a2, a1
@@ -3335,7 +3335,7 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB47_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
-; RV64-NEXT:    slti a2, a2, 0
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    addi a2, a2, -1
@@ -3360,7 +3360,7 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    lw a3, 16(sp)
 ; RV32IFD-NEXT:    beqz a2, .LBB47_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
-; RV32IFD-NEXT:    slti a4, a2, 0
+; RV32IFD-NEXT:    srli a4, a2, 31
 ; RV32IFD-NEXT:    j .LBB47_3
 ; RV32IFD-NEXT:  .LBB47_2:
 ; RV32IFD-NEXT:    seqz a4, a3
@@ -3374,7 +3374,7 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    and a1, a3, a1
 ; RV32IFD-NEXT:    and a0, a3, a0
 ; RV32IFD-NEXT:    and a2, a3, a2
-; RV32IFD-NEXT:    slti a2, a2, 0
+; RV32IFD-NEXT:    srli a2, a2, 31
 ; RV32IFD-NEXT:    addi a2, a2, -1
 ; RV32IFD-NEXT:    and a0, a2, a0
 ; RV32IFD-NEXT:    and a1, a2, a1
@@ -3417,7 +3417,7 @@ define i64 @stest_f32i64_mm(float %x) {
 ; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    beqz a7, .LBB48_4
 ; RV32-NEXT:  .LBB48_3: # %entry
-; RV32-NEXT:    slti a6, a4, 0
+; RV32-NEXT:    srli a6, a4, 31
 ; RV32-NEXT:  .LBB48_4: # %entry
 ; RV32-NEXT:    neg a7, a6
 ; RV32-NEXT:    addi t0, a6, -1
@@ -3439,8 +3439,8 @@ define i64 @stest_f32i64_mm(float %x) {
 ; RV32-NEXT:    li a5, -1
 ; RV32-NEXT:    beq a2, a5, .LBB48_11
 ; RV32-NEXT:  # %bb.10: # %entry
-; RV32-NEXT:    slti a0, a4, 0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    xori a0, a4, 1
 ; RV32-NEXT:  .LBB48_11: # %entry
 ; RV32-NEXT:    bnez a0, .LBB48_13
 ; RV32-NEXT:  # %bb.12: # %entry
@@ -3536,7 +3536,7 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV32-NEXT:    lw a3, 16(sp)
 ; RV32-NEXT:    beqz a2, .LBB50_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a2, 0
+; RV32-NEXT:    srli a4, a2, 31
 ; RV32-NEXT:    j .LBB50_3
 ; RV32-NEXT:  .LBB50_2:
 ; RV32-NEXT:    seqz a4, a3
@@ -3550,7 +3550,7 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    slti a2, a2, 0
+; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    and a1, a2, a1
@@ -3573,7 +3573,7 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB50_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
-; RV64-NEXT:    slti a2, a2, 0
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    addi a2, a2, -1
@@ -3618,7 +3618,7 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV32-NEXT:    or a7, a2, a4
 ; RV32-NEXT:    beqz a7, .LBB51_4
 ; RV32-NEXT:  .LBB51_3: # %entry
-; RV32-NEXT:    slti a6, a4, 0
+; RV32-NEXT:    srli a6, a4, 31
 ; RV32-NEXT:  .LBB51_4: # %entry
 ; RV32-NEXT:    neg a7, a6
 ; RV32-NEXT:    addi t0, a6, -1
@@ -3640,8 +3640,8 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV32-NEXT:    li a5, -1
 ; RV32-NEXT:    beq a2, a5, .LBB51_11
 ; RV32-NEXT:  # %bb.10: # %entry
-; RV32-NEXT:    slti a0, a4, 0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    xori a0, a4, 1
 ; RV32-NEXT:  .LBB51_11: # %entry
 ; RV32-NEXT:    bnez a0, .LBB51_13
 ; RV32-NEXT:  # %bb.12: # %entry
@@ -3667,7 +3667,7 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV64-NEXT:    srli a3, a2, 1
 ; RV64-NEXT:    beqz a1, .LBB51_2
 ; RV64-NEXT:  # %bb.1: # %entry
-; RV64-NEXT:    slti a4, a1, 0
+; RV64-NEXT:    srli a4, a1, 63
 ; RV64-NEXT:    j .LBB51_3
 ; RV64-NEXT:  .LBB51_2:
 ; RV64-NEXT:    sltu a4, a0, a3
@@ -3681,8 +3681,8 @@ define i64 @stest_f16i64_mm(half %x) {
 ; RV64-NEXT:    slli a1, a2, 63
 ; RV64-NEXT:    beq a5, a2, .LBB51_7
 ; RV64-NEXT:  # %bb.6: # %entry
-; RV64-NEXT:    slti a2, a5, 0
-; RV64-NEXT:    xori a2, a2, 1
+; RV64-NEXT:    srli a5, a5, 63
+; RV64-NEXT:    xori a2, a5, 1
 ; RV64-NEXT:    beqz a2, .LBB51_8
 ; RV64-NEXT:    j .LBB51_9
 ; RV64-NEXT:  .LBB51_7:
@@ -3773,7 +3773,7 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV32-NEXT:    lw a3, 16(sp)
 ; RV32-NEXT:    beqz a2, .LBB53_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a2, 0
+; RV32-NEXT:    srli a4, a2, 31
 ; RV32-NEXT:    j .LBB53_3
 ; RV32-NEXT:  .LBB53_2:
 ; RV32-NEXT:    seqz a4, a3
@@ -3787,7 +3787,7 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
 ; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    slti a2, a2, 0
+; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a0, a2, a0
 ; RV32-NEXT:    and a1, a2, a1
@@ -3811,7 +3811,7 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB53_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
-; RV64-NEXT:    slti a2, a2, 0
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    addi a2, a2, -1
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index 0c152e6..961c6cd 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -818,7 +818,7 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV32I-NEXT:    call __gtsf2
 ; RV32I-NEXT:    bgtz a0, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, s2, 0
+; RV32I-NEXT:    srli a0, s2, 31
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and s0, a0, s1
 ; RV32I-NEXT:  .LBB3_2: # %start
@@ -856,7 +856,7 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV64I-NEXT:    call __gtsf2
 ; RV64I-NEXT:    bgtz a0, .LBB3_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    slti a0, s2, 0
+; RV64I-NEXT:    srli a0, s2, 63
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    and s0, a0, s1
 ; RV64I-NEXT:  .LBB3_2: # %start
@@ -1788,7 +1788,7 @@ define i32 @fcvt_wu_h_sat(half %a) nounwind {
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s2, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfsi
@@ -1828,8 +1828,8 @@ define i32 @fcvt_wu_h_sat(half %a) nounwind {
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB8_3: # %start
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -2369,13 +2369,13 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32I-NEXT:    call __unordsf2
 ; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    sgtz a1, s4
-; RV32I-NEXT:    slti a2, s0, 0
+; RV32I-NEXT:    srli s0, s0, 31
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    neg a3, a1
-; RV32I-NEXT:    addi a2, a2, -1
+; RV32I-NEXT:    neg a2, a1
+; RV32I-NEXT:    addi s0, s0, -1
 ; RV32I-NEXT:    and a1, a0, s3
-; RV32I-NEXT:    and a2, a2, s1
-; RV32I-NEXT:    or a2, a3, a2
+; RV32I-NEXT:    and s0, s0, s1
+; RV32I-NEXT:    or a2, a2, s0
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -3051,7 +3051,7 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s2, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfdi
@@ -3085,7 +3085,7 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    addi s2, a0, -1
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __fixunssfdi
@@ -6912,8 +6912,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV32I-NEXT:    mv a0, s3
 ; RV32I-NEXT:    j .LBB34_3
 ; RV32I-NEXT:  .LBB34_2:
-; RV32I-NEXT:    slti a0, s1, 0
-; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    srli s1, s1, 31
+; RV32I-NEXT:    addi a0, s1, -1
 ; RV32I-NEXT:    and a0, a0, s0
 ; RV32I-NEXT:  .LBB34_3: # %start
 ; RV32I-NEXT:    and a0, a0, s3
@@ -6953,8 +6953,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV64I-NEXT:    mv a0, s3
 ; RV64I-NEXT:    j .LBB34_3
 ; RV64I-NEXT:  .LBB34_2:
-; RV64I-NEXT:    slti a0, s1, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s1, s1, 63
+; RV64I-NEXT:    addi a0, s1, -1
 ; RV64I-NEXT:    and a0, a0, s0
 ; RV64I-NEXT:  .LBB34_3: # %start
 ; RV64I-NEXT:    and a0, a0, s3
@@ -7856,8 +7856,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32I-NEXT:    li a0, 255
 ; RV32I-NEXT:    j .LBB38_3
 ; RV32I-NEXT:  .LBB38_2:
-; RV32I-NEXT:    slti a0, s0, 0
-; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    srli s0, s0, 31
+; RV32I-NEXT:    addi a0, s0, -1
 ; RV32I-NEXT:    and a0, a0, s1
 ; RV32I-NEXT:  .LBB38_3: # %start
 ; RV32I-NEXT:    zext.b a0, a0
@@ -7893,8 +7893,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64I-NEXT:    li a0, 255
 ; RV64I-NEXT:    j .LBB38_3
 ; RV64I-NEXT:  .LBB38_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB38_3: # %start
 ; RV64I-NEXT:    zext.b a0, a0
@@ -8130,7 +8130,7 @@ define zeroext i32 @fcvt_wu_h_sat_zext(half %a) nounwind {
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi s2, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfsi
@@ -8170,8 +8170,8 @@ define zeroext i32 @fcvt_wu_h_sat_zext(half %a) nounwind {
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    j .LBB39_3
 ; RV64I-NEXT:  .LBB39_2:
-; RV64I-NEXT:    slti a0, s0, 0
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    srli s0, s0, 63
+; RV64I-NEXT:    addi a0, s0, -1
 ; RV64I-NEXT:    and a0, a0, s1
 ; RV64I-NEXT:  .LBB39_3: # %start
 ; RV64I-NEXT:    slli a0, a0, 32
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index cd93579..afc8e35 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2741,7 +2741,7 @@ define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
@@ -2763,7 +2763,7 @@ define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -2785,7 +2785,7 @@ define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
@@ -2807,7 +2807,7 @@ define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
@@ -5549,7 +5549,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
@@ -5571,7 +5571,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -5593,7 +5593,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
@@ -5615,7 +5615,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
@@ -5637,7 +5637,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-UNALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-UNALIGNED-RV64-NEXT:    call memcmp
-; CHECK-UNALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
@@ -5699,7 +5699,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-UNALIGNED-RV64-V-NEXT:    call memcmp
-; CHECK-UNALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index a5bdb13..c737edb 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -3161,7 +3161,7 @@ define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
@@ -3183,7 +3183,7 @@ define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -3205,7 +3205,7 @@ define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
@@ -3227,7 +3227,7 @@ define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
@@ -3454,7 +3454,7 @@ define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV32-NEXT:    call bcmp
-; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-NEXT:    srli a0, a0, 31
 ; CHECK-ALIGNED-RV32-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
@@ -3466,7 +3466,7 @@ define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
@@ -3478,7 +3478,7 @@ define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 31
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
@@ -3490,7 +3490,7 @@ define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
@@ -3502,7 +3502,7 @@ define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 31
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
@@ -3514,7 +3514,7 @@ define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
@@ -3526,7 +3526,7 @@ define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
-; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-V-NEXT:    srli a0, a0, 31
 ; CHECK-ALIGNED-RV32-V-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
@@ -3538,7 +3538,7 @@ define i1 @bcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
-; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-V-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
@@ -6839,7 +6839,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
@@ -6861,7 +6861,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -6883,7 +6883,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
@@ -6905,7 +6905,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
@@ -6927,7 +6927,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-UNALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-UNALIGNED-RV64-NEXT:    call memcmp
-; CHECK-UNALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
@@ -6989,7 +6989,7 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-UNALIGNED-RV64-V-NEXT:    call memcmp
-; CHECK-UNALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
@@ -7366,7 +7366,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV32-NEXT:    call memcmp
-; CHECK-ALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-NEXT:    srli a0, a0, 31
 ; CHECK-ALIGNED-RV32-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
@@ -7378,7 +7378,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
@@ -7390,7 +7390,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    call memcmp
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 31
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
@@ -7402,7 +7402,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
@@ -7414,7 +7414,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call memcmp
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 31
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
@@ -7426,7 +7426,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
@@ -7438,7 +7438,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV32-V-NEXT:    call memcmp
-; CHECK-ALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV32-V-NEXT:    srli a0, a0, 31
 ; CHECK-ALIGNED-RV32-V-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
@@ -7450,7 +7450,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
-; CHECK-ALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-ALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-ALIGNED-RV64-V-NEXT:    xori a0, a0, 1
 ; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
@@ -7462,7 +7462,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-UNALIGNED-RV32-NEXT:    li a2, 4
 ; CHECK-UNALIGNED-RV32-NEXT:    call memcmp
-; CHECK-UNALIGNED-RV32-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV32-NEXT:    srli a0, a0, 31
 ; CHECK-UNALIGNED-RV32-NEXT:    xori a0, a0, 1
 ; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
@@ -7474,7 +7474,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-UNALIGNED-RV64-NEXT:    li a2, 4
 ; CHECK-UNALIGNED-RV64-NEXT:    call memcmp
-; CHECK-UNALIGNED-RV64-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-NEXT:    srli a0, a0, 63
 ; CHECK-UNALIGNED-RV64-NEXT:    xori a0, a0, 1
 ; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
@@ -7530,7 +7530,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 4
 ; CHECK-UNALIGNED-RV32-V-NEXT:    call memcmp
-; CHECK-UNALIGNED-RV32-V-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV32-V-NEXT:    srli a0, a0, 31
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xori a0, a0, 1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 16
@@ -7542,7 +7542,7 @@ define i1 @memcmp_ge_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 4
 ; CHECK-UNALIGNED-RV64-V-NEXT:    call memcmp
-; CHECK-UNALIGNED-RV64-V-NEXT:    slti a0, a0, 0
+; CHECK-UNALIGNED-RV64-V-NEXT:    srli a0, a0, 63
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xori a0, a0, 1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/min-max.ll b/llvm/test/CodeGen/RISCV/min-max.ll
index 0115b48..acde8ad 100644
--- a/llvm/test/CodeGen/RISCV/min-max.ll
+++ b/llvm/test/CodeGen/RISCV/min-max.ll
@@ -642,7 +642,7 @@ define signext i32 @smin_i32_negone(i32 signext %a) {
 define i64 @smin_i64_negone(i64 %a) {
 ; RV32I-LABEL: smin_i64_negone:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slti a2, a1, 0
+; RV32I-NEXT:    srli a2, a1, 31
 ; RV32I-NEXT:    addi a2, a2, -1
 ; RV32I-NEXT:    or a0, a2, a0
 ; RV32I-NEXT:    slti a2, a1, -1
@@ -661,7 +661,7 @@ define i64 @smin_i64_negone(i64 %a) {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    li a2, -1
 ; RV32ZBB-NEXT:    min a2, a1, a2
-; RV32ZBB-NEXT:    slti a1, a1, 0
+; RV32ZBB-NEXT:    srli a1, a1, 31
 ; RV32ZBB-NEXT:    addi a1, a1, -1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    mv a1, a2
diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
index 30a9355..f846736 100644
--- a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
+++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
@@ -39,7 +39,7 @@ define i1 @pr85190(i64 %a) {
 ; CHECK-NOZBB-LABEL: pr85190:
 ; CHECK-NOZBB:       # %bb.0:
 ; CHECK-NOZBB-NEXT:    ori a1, a0, 7
-; CHECK-NOZBB-NEXT:    slti a2, a0, 0
+; CHECK-NOZBB-NEXT:    srli a2, a0, 63
 ; CHECK-NOZBB-NEXT:    li a3, -1
 ; CHECK-NOZBB-NEXT:    slli a3, a3, 63
 ; CHECK-NOZBB-NEXT:    sub a3, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
index 1736074..7ab3d7c 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
@@ -431,7 +431,7 @@ define i64 @not_shl_one_i64(i64 %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, a0, -32
 ; CHECK-NEXT:    li a2, 1
-; CHECK-NEXT:    slti a1, a1, 0
+; CHECK-NEXT:    srli a1, a1, 31
 ; CHECK-NEXT:    sll a0, a2, a0
 ; CHECK-NEXT:    neg a2, a1
 ; CHECK-NEXT:    addi a1, a1, -1
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index e3728bf..dcb70f8 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -53,11 +53,11 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:    addi a5, a3, -32
 ; RV32I-NEXT:    sll a2, a4, a2
 ; RV32I-NEXT:    sll a3, a4, a3
-; RV32I-NEXT:    slti a4, a5, 0
-; RV32I-NEXT:    neg a5, a4
-; RV32I-NEXT:    addi a4, a4, -1
-; RV32I-NEXT:    and a2, a5, a2
-; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    srli a5, a5, 31
+; RV32I-NEXT:    neg a4, a5
+; RV32I-NEXT:    addi a5, a5, -1
+; RV32I-NEXT:    and a2, a4, a2
+; RV32I-NEXT:    and a3, a5, a3
 ; RV32I-NEXT:    not a2, a2
 ; RV32I-NEXT:    not a3, a3
 ; RV32I-NEXT:    and a0, a2, a0
@@ -70,7 +70,7 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBSNOZBB-NEXT:    bset a2, zero, a2
 ; RV32ZBSNOZBB-NEXT:    addi a4, a3, -32
 ; RV32ZBSNOZBB-NEXT:    bset a3, zero, a3
-; RV32ZBSNOZBB-NEXT:    slti a4, a4, 0
+; RV32ZBSNOZBB-NEXT:    srli a4, a4, 31
 ; RV32ZBSNOZBB-NEXT:    neg a5, a4
 ; RV32ZBSNOZBB-NEXT:    addi a4, a4, -1
 ; RV32ZBSNOZBB-NEXT:    and a2, a5, a2
@@ -87,7 +87,7 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBSZBB-NEXT:    bset a2, zero, a2
 ; RV32ZBSZBB-NEXT:    bset a4, zero, a3
 ; RV32ZBSZBB-NEXT:    addi a3, a3, -32
-; RV32ZBSZBB-NEXT:    slti a3, a3, 0
+; RV32ZBSZBB-NEXT:    srli a3, a3, 31
 ; RV32ZBSZBB-NEXT:    addi a5, a3, -1
 ; RV32ZBSZBB-NEXT:    neg a3, a3
 ; RV32ZBSZBB-NEXT:    and a4, a5, a4
@@ -188,7 +188,7 @@ define signext i64 @bset_i64_zero(i64 signext %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a1, a0, -32
 ; RV32I-NEXT:    li a2, 1
-; RV32I-NEXT:    slti a1, a1, 0
+; RV32I-NEXT:    srli a1, a1, 31
 ; RV32I-NEXT:    sll a2, a2, a0
 ; RV32I-NEXT:    neg a0, a1
 ; RV32I-NEXT:    addi a1, a1, -1
@@ -200,11 +200,11 @@ define signext i64 @bset_i64_zero(i64 signext %a) nounwind {
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    addi a1, a0, -32
 ; RV32ZBS-NEXT:    bset a2, zero, a0
-; RV32ZBS-NEXT:    slti a0, a1, 0
-; RV32ZBS-NEXT:    neg a1, a0
-; RV32ZBS-NEXT:    addi a3, a0, -1
-; RV32ZBS-NEXT:    and a0, a1, a2
-; RV32ZBS-NEXT:    and a1, a3, a2
+; RV32ZBS-NEXT:    srli a1, a1, 31
+; RV32ZBS-NEXT:    neg a0, a1
+; RV32ZBS-NEXT:    addi a1, a1, -1
+; RV32ZBS-NEXT:    and a0, a0, a2
+; RV32ZBS-NEXT:    and a1, a1, a2
 ; RV32ZBS-NEXT:    ret
   %shl = shl i64 1, %a
   ret i64 %shl
diff --git a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
index dd49d9e..caa6c2f 100644
--- a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
@@ -97,7 +97,7 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64I-NEXT:    mv a1, s0
 ; RV64I-NEXT:    call __unorddf2
 ; RV64I-NEXT:    snez a0, a0
-; RV64I-NEXT:    slti a1, s2, 0
+; RV64I-NEXT:    srli a1, s2, 63
 ; RV64I-NEXT:    sgtz a2, s4
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    addi a3, a1, -1
@@ -207,7 +207,7 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind {
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    addi s2, a0, -1
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __fixunsdfti
diff --git a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
index 896e371..ebda785 100644
--- a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
@@ -95,7 +95,7 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind {
 ; RV64I-NEXT:    mv a1, s1
 ; RV64I-NEXT:    call __unordsf2
 ; RV64I-NEXT:    snez a0, a0
-; RV64I-NEXT:    slti a1, s2, 0
+; RV64I-NEXT:    srli a1, s2, 63
 ; RV64I-NEXT:    sgtz a2, s4
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    addi a3, a1, -1
@@ -209,7 +209,7 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind {
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    addi s2, a0, -1
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __fixunssfti
diff --git a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
index f89d1abf..648f378 100644
--- a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
@@ -173,13 +173,13 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind {
 ; RV64I-NEXT:    call __unordsf2
 ; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    sgtz a1, s4
-; RV64I-NEXT:    slti a2, s0, 0
+; RV64I-NEXT:    srli s0, s0, 63
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    neg a3, a1
-; RV64I-NEXT:    addi a2, a2, -1
+; RV64I-NEXT:    neg a2, a1
+; RV64I-NEXT:    addi s0, s0, -1
 ; RV64I-NEXT:    and a1, a0, s3
-; RV64I-NEXT:    and a2, a2, s1
-; RV64I-NEXT:    or a2, a3, a2
+; RV64I-NEXT:    and s0, s0, s1
+; RV64I-NEXT:    or a2, a2, s0
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -288,7 +288,7 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    call __gesf2
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    addi s2, a0, -1
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __fixunssfti
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 9ef7f94..aba9d37 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -2327,7 +2327,7 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    srli a3, a0, 1
 ; CHECK-NOV-NEXT:    beqz a1, .LBB18_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    srli a4, a1, 63
 ; CHECK-NOV-NEXT:    bnez s1, .LBB18_4
 ; CHECK-NOV-NEXT:  .LBB18_2:
 ; CHECK-NOV-NEXT:    sltu a5, s0, a3
@@ -2337,7 +2337,7 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    sltu a4, a2, a3
 ; CHECK-NOV-NEXT:    beqz s1, .LBB18_2
 ; CHECK-NOV-NEXT:  .LBB18_4: # %entry
-; CHECK-NOV-NEXT:    slti a5, s1, 0
+; CHECK-NOV-NEXT:    srli a5, s1, 63
 ; CHECK-NOV-NEXT:    bnez a5, .LBB18_6
 ; CHECK-NOV-NEXT:  .LBB18_5: # %entry
 ; CHECK-NOV-NEXT:    mv s0, a3
@@ -2353,8 +2353,8 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    slli a1, a0, 63
 ; CHECK-NOV-NEXT:    beq a5, a0, .LBB18_11
 ; CHECK-NOV-NEXT:  # %bb.9: # %entry
-; CHECK-NOV-NEXT:    slti a3, a5, 0
-; CHECK-NOV-NEXT:    xori a3, a3, 1
+; CHECK-NOV-NEXT:    srli a5, a5, 63
+; CHECK-NOV-NEXT:    xori a3, a5, 1
 ; CHECK-NOV-NEXT:    bne a4, a0, .LBB18_12
 ; CHECK-NOV-NEXT:  .LBB18_10:
 ; CHECK-NOV-NEXT:    sltu a0, a1, s0
@@ -2364,8 +2364,8 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    sltu a3, a1, a2
 ; CHECK-NOV-NEXT:    beq a4, a0, .LBB18_10
 ; CHECK-NOV-NEXT:  .LBB18_12: # %entry
-; CHECK-NOV-NEXT:    slti a0, a4, 0
-; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    srli a4, a4, 63
+; CHECK-NOV-NEXT:    xori a0, a4, 1
 ; CHECK-NOV-NEXT:    bnez a0, .LBB18_14
 ; CHECK-NOV-NEXT:  .LBB18_13: # %entry
 ; CHECK-NOV-NEXT:    mv s0, a1
@@ -2415,7 +2415,7 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    srli a3, a2, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB18_3
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    srli a4, a1, 63
 ; CHECK-V-NEXT:    bnez s1, .LBB18_4
 ; CHECK-V-NEXT:  .LBB18_2:
 ; CHECK-V-NEXT:    sltu a5, s0, a3
@@ -2425,7 +2425,7 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    sltu a4, a0, a3
 ; CHECK-V-NEXT:    beqz s1, .LBB18_2
 ; CHECK-V-NEXT:  .LBB18_4: # %entry
-; CHECK-V-NEXT:    slti a5, s1, 0
+; CHECK-V-NEXT:    srli a5, s1, 63
 ; CHECK-V-NEXT:    bnez a5, .LBB18_6
 ; CHECK-V-NEXT:  .LBB18_5: # %entry
 ; CHECK-V-NEXT:    mv s0, a3
@@ -2441,8 +2441,8 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    slli a1, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB18_11
 ; CHECK-V-NEXT:  # %bb.9: # %entry
-; CHECK-V-NEXT:    slti a3, a5, 0
-; CHECK-V-NEXT:    xori a3, a3, 1
+; CHECK-V-NEXT:    srli a5, a5, 63
+; CHECK-V-NEXT:    xori a3, a5, 1
 ; CHECK-V-NEXT:    bne a4, a2, .LBB18_12
 ; CHECK-V-NEXT:  .LBB18_10:
 ; CHECK-V-NEXT:    sltu a2, a1, s0
@@ -2452,8 +2452,8 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    sltu a3, a1, a0
 ; CHECK-V-NEXT:    beq a4, a2, .LBB18_10
 ; CHECK-V-NEXT:  .LBB18_12: # %entry
-; CHECK-V-NEXT:    slti a2, a4, 0
-; CHECK-V-NEXT:    xori a2, a2, 1
+; CHECK-V-NEXT:    srli a4, a4, 63
+; CHECK-V-NEXT:    xori a2, a4, 1
 ; CHECK-V-NEXT:    bnez a2, .LBB18_14
 ; CHECK-V-NEXT:  .LBB18_13: # %entry
 ; CHECK-V-NEXT:    mv s0, a1
@@ -2749,7 +2749,7 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    srli a3, a0, 1
 ; CHECK-NOV-NEXT:    beqz a1, .LBB21_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    srli a4, a1, 63
 ; CHECK-NOV-NEXT:    bnez s1, .LBB21_4
 ; CHECK-NOV-NEXT:  .LBB21_2:
 ; CHECK-NOV-NEXT:    sltu a5, s0, a3
@@ -2759,7 +2759,7 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    sltu a4, a2, a3
 ; CHECK-NOV-NEXT:    beqz s1, .LBB21_2
 ; CHECK-NOV-NEXT:  .LBB21_4: # %entry
-; CHECK-NOV-NEXT:    slti a5, s1, 0
+; CHECK-NOV-NEXT:    srli a5, s1, 63
 ; CHECK-NOV-NEXT:    bnez a5, .LBB21_6
 ; CHECK-NOV-NEXT:  .LBB21_5: # %entry
 ; CHECK-NOV-NEXT:    mv s0, a3
@@ -2775,8 +2775,8 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    slli a1, a0, 63
 ; CHECK-NOV-NEXT:    beq a5, a0, .LBB21_11
 ; CHECK-NOV-NEXT:  # %bb.9: # %entry
-; CHECK-NOV-NEXT:    slti a3, a5, 0
-; CHECK-NOV-NEXT:    xori a3, a3, 1
+; CHECK-NOV-NEXT:    srli a5, a5, 63
+; CHECK-NOV-NEXT:    xori a3, a5, 1
 ; CHECK-NOV-NEXT:    bne a4, a0, .LBB21_12
 ; CHECK-NOV-NEXT:  .LBB21_10:
 ; CHECK-NOV-NEXT:    sltu a0, a1, s0
@@ -2786,8 +2786,8 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    sltu a3, a1, a2
 ; CHECK-NOV-NEXT:    beq a4, a0, .LBB21_10
 ; CHECK-NOV-NEXT:  .LBB21_12: # %entry
-; CHECK-NOV-NEXT:    slti a0, a4, 0
-; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    srli a4, a4, 63
+; CHECK-NOV-NEXT:    xori a0, a4, 1
 ; CHECK-NOV-NEXT:    bnez a0, .LBB21_14
 ; CHECK-NOV-NEXT:  .LBB21_13: # %entry
 ; CHECK-NOV-NEXT:    mv s0, a1
@@ -2837,7 +2837,7 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    srli a3, a2, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB21_3
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    srli a4, a1, 63
 ; CHECK-V-NEXT:    bnez s1, .LBB21_4
 ; CHECK-V-NEXT:  .LBB21_2:
 ; CHECK-V-NEXT:    sltu a5, s0, a3
@@ -2847,7 +2847,7 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    sltu a4, a0, a3
 ; CHECK-V-NEXT:    beqz s1, .LBB21_2
 ; CHECK-V-NEXT:  .LBB21_4: # %entry
-; CHECK-V-NEXT:    slti a5, s1, 0
+; CHECK-V-NEXT:    srli a5, s1, 63
 ; CHECK-V-NEXT:    bnez a5, .LBB21_6
 ; CHECK-V-NEXT:  .LBB21_5: # %entry
 ; CHECK-V-NEXT:    mv s0, a3
@@ -2863,8 +2863,8 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    slli a1, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB21_11
 ; CHECK-V-NEXT:  # %bb.9: # %entry
-; CHECK-V-NEXT:    slti a3, a5, 0
-; CHECK-V-NEXT:    xori a3, a3, 1
+; CHECK-V-NEXT:    srli a5, a5, 63
+; CHECK-V-NEXT:    xori a3, a5, 1
 ; CHECK-V-NEXT:    bne a4, a2, .LBB21_12
 ; CHECK-V-NEXT:  .LBB21_10:
 ; CHECK-V-NEXT:    sltu a2, a1, s0
@@ -2874,8 +2874,8 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    sltu a3, a1, a0
 ; CHECK-V-NEXT:    beq a4, a2, .LBB21_10
 ; CHECK-V-NEXT:  .LBB21_12: # %entry
-; CHECK-V-NEXT:    slti a2, a4, 0
-; CHECK-V-NEXT:    xori a2, a2, 1
+; CHECK-V-NEXT:    srli a4, a4, 63
+; CHECK-V-NEXT:    xori a2, a4, 1
 ; CHECK-V-NEXT:    bnez a2, .LBB21_14
 ; CHECK-V-NEXT:  .LBB21_13: # %entry
 ; CHECK-V-NEXT:    mv s0, a1
@@ -3174,7 +3174,7 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    srli a3, a0, 1
 ; CHECK-NOV-NEXT:    beqz a1, .LBB24_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    srli a4, a1, 63
 ; CHECK-NOV-NEXT:    bnez s1, .LBB24_4
 ; CHECK-NOV-NEXT:  .LBB24_2:
 ; CHECK-NOV-NEXT:    sltu a5, s0, a3
@@ -3184,7 +3184,7 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    sltu a4, a2, a3
 ; CHECK-NOV-NEXT:    beqz s1, .LBB24_2
 ; CHECK-NOV-NEXT:  .LBB24_4: # %entry
-; CHECK-NOV-NEXT:    slti a5, s1, 0
+; CHECK-NOV-NEXT:    srli a5, s1, 63
 ; CHECK-NOV-NEXT:    bnez a5, .LBB24_6
 ; CHECK-NOV-NEXT:  .LBB24_5: # %entry
 ; CHECK-NOV-NEXT:    mv s0, a3
@@ -3200,8 +3200,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    slli a1, a0, 63
 ; CHECK-NOV-NEXT:    beq a5, a0, .LBB24_11
 ; CHECK-NOV-NEXT:  # %bb.9: # %entry
-; CHECK-NOV-NEXT:    slti a3, a5, 0
-; CHECK-NOV-NEXT:    xori a3, a3, 1
+; CHECK-NOV-NEXT:    srli a5, a5, 63
+; CHECK-NOV-NEXT:    xori a3, a5, 1
 ; CHECK-NOV-NEXT:    bne a4, a0, .LBB24_12
 ; CHECK-NOV-NEXT:  .LBB24_10:
 ; CHECK-NOV-NEXT:    sltu a0, a1, s0
@@ -3211,8 +3211,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    sltu a3, a1, a2
 ; CHECK-NOV-NEXT:    beq a4, a0, .LBB24_10
 ; CHECK-NOV-NEXT:  .LBB24_12: # %entry
-; CHECK-NOV-NEXT:    slti a0, a4, 0
-; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    srli a4, a4, 63
+; CHECK-NOV-NEXT:    xori a0, a4, 1
 ; CHECK-NOV-NEXT:    bnez a0, .LBB24_14
 ; CHECK-NOV-NEXT:  .LBB24_13: # %entry
 ; CHECK-NOV-NEXT:    mv s0, a1
@@ -3260,7 +3260,7 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    srli a3, a2, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB24_3
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    srli a4, a1, 63
 ; CHECK-V-NEXT:    bnez s1, .LBB24_4
 ; CHECK-V-NEXT:  .LBB24_2:
 ; CHECK-V-NEXT:    sltu a5, s0, a3
@@ -3270,7 +3270,7 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    sltu a4, a0, a3
 ; CHECK-V-NEXT:    beqz s1, .LBB24_2
 ; CHECK-V-NEXT:  .LBB24_4: # %entry
-; CHECK-V-NEXT:    slti a5, s1, 0
+; CHECK-V-NEXT:    srli a5, s1, 63
 ; CHECK-V-NEXT:    bnez a5, .LBB24_6
 ; CHECK-V-NEXT:  .LBB24_5: # %entry
 ; CHECK-V-NEXT:    mv s0, a3
@@ -3286,8 +3286,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB24_11
 ; CHECK-V-NEXT:  # %bb.9: # %entry
-; CHECK-V-NEXT:    slti a3, a5, 0
-; CHECK-V-NEXT:    xori a3, a3, 1
+; CHECK-V-NEXT:    srli a5, a5, 63
+; CHECK-V-NEXT:    xori a3, a5, 1
 ; CHECK-V-NEXT:    bne a4, a2, .LBB24_12
 ; CHECK-V-NEXT:  .LBB24_10:
 ; CHECK-V-NEXT:    sltu a2, a1, s0
@@ -3297,8 +3297,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    sltu a3, a1, a0
 ; CHECK-V-NEXT:    beq a4, a2, .LBB24_10
 ; CHECK-V-NEXT:  .LBB24_12: # %entry
-; CHECK-V-NEXT:    slti a2, a4, 0
-; CHECK-V-NEXT:    xori a2, a2, 1
+; CHECK-V-NEXT:    srli a4, a4, 63
+; CHECK-V-NEXT:    xori a2, a4, 1
 ; CHECK-V-NEXT:    bnez a2, .LBB24_14
 ; CHECK-V-NEXT:  .LBB24_13: # %entry
 ; CHECK-V-NEXT:    mv s0, a1
@@ -5864,7 +5864,7 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    srli a3, a0, 1
 ; CHECK-NOV-NEXT:    beqz a1, .LBB45_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    srli a4, a1, 63
 ; CHECK-NOV-NEXT:    beqz a4, .LBB45_3
 ; CHECK-NOV-NEXT:    j .LBB45_4
 ; CHECK-NOV-NEXT:  .LBB45_2:
@@ -5875,7 +5875,7 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:  .LBB45_4: # %entry
 ; CHECK-NOV-NEXT:    beqz s1, .LBB45_6
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
-; CHECK-NOV-NEXT:    slti a6, s1, 0
+; CHECK-NOV-NEXT:    srli a6, s1, 63
 ; CHECK-NOV-NEXT:    j .LBB45_7
 ; CHECK-NOV-NEXT:  .LBB45_6:
 ; CHECK-NOV-NEXT:    sltu a6, s0, a3
@@ -5890,7 +5890,7 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    slli a3, a0, 63
 ; CHECK-NOV-NEXT:    beq a5, a0, .LBB45_11
 ; CHECK-NOV-NEXT:  # %bb.10: # %entry
-; CHECK-NOV-NEXT:    slti a5, a5, 0
+; CHECK-NOV-NEXT:    srli a5, a5, 63
 ; CHECK-NOV-NEXT:    xori a5, a5, 1
 ; CHECK-NOV-NEXT:    and a1, a4, a1
 ; CHECK-NOV-NEXT:    beqz a5, .LBB45_12
@@ -5904,8 +5904,8 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:  .LBB45_13: # %entry
 ; CHECK-NOV-NEXT:    beq a1, a0, .LBB45_15
 ; CHECK-NOV-NEXT:  # %bb.14: # %entry
-; CHECK-NOV-NEXT:    slti a0, a1, 0
-; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    srli a1, a1, 63
+; CHECK-NOV-NEXT:    xori a0, a1, 1
 ; CHECK-NOV-NEXT:    beqz a0, .LBB45_16
 ; CHECK-NOV-NEXT:    j .LBB45_17
 ; CHECK-NOV-NEXT:  .LBB45_15:
@@ -5955,7 +5955,7 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:    srli a3, a2, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB45_2
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    srli a4, a1, 63
 ; CHECK-V-NEXT:    beqz a4, .LBB45_3
 ; CHECK-V-NEXT:    j .LBB45_4
 ; CHECK-V-NEXT:  .LBB45_2:
@@ -5966,7 +5966,7 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:  .LBB45_4: # %entry
 ; CHECK-V-NEXT:    beqz s1, .LBB45_6
 ; CHECK-V-NEXT:  # %bb.5: # %entry
-; CHECK-V-NEXT:    slti a6, s1, 0
+; CHECK-V-NEXT:    srli a6, s1, 63
 ; CHECK-V-NEXT:    j .LBB45_7
 ; CHECK-V-NEXT:  .LBB45_6:
 ; CHECK-V-NEXT:    sltu a6, s0, a3
@@ -5981,7 +5981,7 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:    slli a3, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB45_11
 ; CHECK-V-NEXT:  # %bb.10: # %entry
-; CHECK-V-NEXT:    slti a5, a5, 0
+; CHECK-V-NEXT:    srli a5, a5, 63
 ; CHECK-V-NEXT:    xori a5, a5, 1
 ; CHECK-V-NEXT:    and a1, a4, a1
 ; CHECK-V-NEXT:    beqz a5, .LBB45_12
@@ -5995,7 +5995,7 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:  .LBB45_13: # %entry
 ; CHECK-V-NEXT:    beq a1, a2, .LBB45_15
 ; CHECK-V-NEXT:  # %bb.14: # %entry
-; CHECK-V-NEXT:    slti a1, a1, 0
+; CHECK-V-NEXT:    srli a1, a1, 63
 ; CHECK-V-NEXT:    xori a1, a1, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB45_16
 ; CHECK-V-NEXT:    j .LBB45_17
@@ -6153,8 +6153,8 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:  .LBB47_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
 ; CHECK-NOV-NEXT:    slti a4, s1, 1
-; CHECK-NOV-NEXT:    slti a3, a3, 0
-; CHECK-NOV-NEXT:    slti a2, a2, 0
+; CHECK-NOV-NEXT:    srli a3, a3, 63
+; CHECK-NOV-NEXT:    srli a2, a2, 63
 ; CHECK-NOV-NEXT:    neg a1, a1
 ; CHECK-NOV-NEXT:    neg a4, a4
 ; CHECK-NOV-NEXT:    addi a3, a3, -1
@@ -6210,8 +6210,8 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:  .LBB47_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
 ; CHECK-V-NEXT:    slti a4, s1, 1
-; CHECK-V-NEXT:    slti a3, a3, 0
-; CHECK-V-NEXT:    slti a2, a2, 0
+; CHECK-V-NEXT:    srli a3, a3, 63
+; CHECK-V-NEXT:    srli a2, a2, 63
 ; CHECK-V-NEXT:    neg a1, a1
 ; CHECK-V-NEXT:    neg a4, a4
 ; CHECK-V-NEXT:    addi a3, a3, -1
@@ -6268,7 +6268,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    srli a3, a0, 1
 ; CHECK-NOV-NEXT:    beqz a1, .LBB48_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    srli a4, a1, 63
 ; CHECK-NOV-NEXT:    beqz a4, .LBB48_3
 ; CHECK-NOV-NEXT:    j .LBB48_4
 ; CHECK-NOV-NEXT:  .LBB48_2:
@@ -6279,7 +6279,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB48_4: # %entry
 ; CHECK-NOV-NEXT:    beqz s1, .LBB48_6
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
-; CHECK-NOV-NEXT:    slti a6, s1, 0
+; CHECK-NOV-NEXT:    srli a6, s1, 63
 ; CHECK-NOV-NEXT:    j .LBB48_7
 ; CHECK-NOV-NEXT:  .LBB48_6:
 ; CHECK-NOV-NEXT:    sltu a6, s0, a3
@@ -6294,7 +6294,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    slli a3, a0, 63
 ; CHECK-NOV-NEXT:    beq a5, a0, .LBB48_11
 ; CHECK-NOV-NEXT:  # %bb.10: # %entry
-; CHECK-NOV-NEXT:    slti a5, a5, 0
+; CHECK-NOV-NEXT:    srli a5, a5, 63
 ; CHECK-NOV-NEXT:    xori a5, a5, 1
 ; CHECK-NOV-NEXT:    and a1, a4, a1
 ; CHECK-NOV-NEXT:    beqz a5, .LBB48_12
@@ -6308,8 +6308,8 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB48_13: # %entry
 ; CHECK-NOV-NEXT:    beq a1, a0, .LBB48_15
 ; CHECK-NOV-NEXT:  # %bb.14: # %entry
-; CHECK-NOV-NEXT:    slti a0, a1, 0
-; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    srli a1, a1, 63
+; CHECK-NOV-NEXT:    xori a0, a1, 1
 ; CHECK-NOV-NEXT:    beqz a0, .LBB48_16
 ; CHECK-NOV-NEXT:    j .LBB48_17
 ; CHECK-NOV-NEXT:  .LBB48_15:
@@ -6359,7 +6359,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    srli a3, a2, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB48_2
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    srli a4, a1, 63
 ; CHECK-V-NEXT:    beqz a4, .LBB48_3
 ; CHECK-V-NEXT:    j .LBB48_4
 ; CHECK-V-NEXT:  .LBB48_2:
@@ -6370,7 +6370,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:  .LBB48_4: # %entry
 ; CHECK-V-NEXT:    beqz s1, .LBB48_6
 ; CHECK-V-NEXT:  # %bb.5: # %entry
-; CHECK-V-NEXT:    slti a6, s1, 0
+; CHECK-V-NEXT:    srli a6, s1, 63
 ; CHECK-V-NEXT:    j .LBB48_7
 ; CHECK-V-NEXT:  .LBB48_6:
 ; CHECK-V-NEXT:    sltu a6, s0, a3
@@ -6385,7 +6385,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    slli a3, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB48_11
 ; CHECK-V-NEXT:  # %bb.10: # %entry
-; CHECK-V-NEXT:    slti a5, a5, 0
+; CHECK-V-NEXT:    srli a5, a5, 63
 ; CHECK-V-NEXT:    xori a5, a5, 1
 ; CHECK-V-NEXT:    and a1, a4, a1
 ; CHECK-V-NEXT:    beqz a5, .LBB48_12
@@ -6399,7 +6399,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:  .LBB48_13: # %entry
 ; CHECK-V-NEXT:    beq a1, a2, .LBB48_15
 ; CHECK-V-NEXT:  # %bb.14: # %entry
-; CHECK-V-NEXT:    slti a1, a1, 0
+; CHECK-V-NEXT:    srli a1, a1, 63
 ; CHECK-V-NEXT:    xori a1, a1, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB48_16
 ; CHECK-V-NEXT:    j .LBB48_17
@@ -6557,8 +6557,8 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:  .LBB50_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
 ; CHECK-NOV-NEXT:    slti a4, s1, 1
-; CHECK-NOV-NEXT:    slti a3, a3, 0
-; CHECK-NOV-NEXT:    slti a2, a2, 0
+; CHECK-NOV-NEXT:    srli a3, a3, 63
+; CHECK-NOV-NEXT:    srli a2, a2, 63
 ; CHECK-NOV-NEXT:    neg a1, a1
 ; CHECK-NOV-NEXT:    neg a4, a4
 ; CHECK-NOV-NEXT:    addi a3, a3, -1
@@ -6614,8 +6614,8 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:  .LBB50_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
 ; CHECK-V-NEXT:    slti a4, s1, 1
-; CHECK-V-NEXT:    slti a3, a3, 0
-; CHECK-V-NEXT:    slti a2, a2, 0
+; CHECK-V-NEXT:    srli a3, a3, 63
+; CHECK-V-NEXT:    srli a2, a2, 63
 ; CHECK-V-NEXT:    neg a1, a1
 ; CHECK-V-NEXT:    neg a4, a4
 ; CHECK-V-NEXT:    addi a3, a3, -1
@@ -6675,7 +6675,7 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    srli a3, a0, 1
 ; CHECK-NOV-NEXT:    beqz a1, .LBB51_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    srli a4, a1, 63
 ; CHECK-NOV-NEXT:    beqz a4, .LBB51_3
 ; CHECK-NOV-NEXT:    j .LBB51_4
 ; CHECK-NOV-NEXT:  .LBB51_2:
@@ -6686,7 +6686,7 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:  .LBB51_4: # %entry
 ; CHECK-NOV-NEXT:    beqz s1, .LBB51_6
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
-; CHECK-NOV-NEXT:    slti a6, s1, 0
+; CHECK-NOV-NEXT:    srli a6, s1, 63
 ; CHECK-NOV-NEXT:    j .LBB51_7
 ; CHECK-NOV-NEXT:  .LBB51_6:
 ; CHECK-NOV-NEXT:    sltu a6, s0, a3
@@ -6701,7 +6701,7 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    slli a3, a0, 63
 ; CHECK-NOV-NEXT:    beq a5, a0, .LBB51_11
 ; CHECK-NOV-NEXT:  # %bb.10: # %entry
-; CHECK-NOV-NEXT:    slti a5, a5, 0
+; CHECK-NOV-NEXT:    srli a5, a5, 63
 ; CHECK-NOV-NEXT:    xori a5, a5, 1
 ; CHECK-NOV-NEXT:    and a1, a4, a1
 ; CHECK-NOV-NEXT:    beqz a5, .LBB51_12
@@ -6715,8 +6715,8 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:  .LBB51_13: # %entry
 ; CHECK-NOV-NEXT:    beq a1, a0, .LBB51_15
 ; CHECK-NOV-NEXT:  # %bb.14: # %entry
-; CHECK-NOV-NEXT:    slti a0, a1, 0
-; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    srli a1, a1, 63
+; CHECK-NOV-NEXT:    xori a0, a1, 1
 ; CHECK-NOV-NEXT:    beqz a0, .LBB51_16
 ; CHECK-NOV-NEXT:    j .LBB51_17
 ; CHECK-NOV-NEXT:  .LBB51_15:
@@ -6764,7 +6764,7 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:    srli a3, a2, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB51_2
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    srli a4, a1, 63
 ; CHECK-V-NEXT:    beqz a4, .LBB51_3
 ; CHECK-V-NEXT:    j .LBB51_4
 ; CHECK-V-NEXT:  .LBB51_2:
@@ -6775,7 +6775,7 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:  .LBB51_4: # %entry
 ; CHECK-V-NEXT:    beqz s1, .LBB51_6
 ; CHECK-V-NEXT:  # %bb.5: # %entry
-; CHECK-V-NEXT:    slti a6, s1, 0
+; CHECK-V-NEXT:    srli a6, s1, 63
 ; CHECK-V-NEXT:    j .LBB51_7
 ; CHECK-V-NEXT:  .LBB51_6:
 ; CHECK-V-NEXT:    sltu a6, s0, a3
@@ -6790,7 +6790,7 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:    slli a3, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB51_11
 ; CHECK-V-NEXT:  # %bb.10: # %entry
-; CHECK-V-NEXT:    slti a5, a5, 0
+; CHECK-V-NEXT:    srli a5, a5, 63
 ; CHECK-V-NEXT:    xori a5, a5, 1
 ; CHECK-V-NEXT:    and a1, a4, a1
 ; CHECK-V-NEXT:    beqz a5, .LBB51_12
@@ -6804,7 +6804,7 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:  .LBB51_13: # %entry
 ; CHECK-V-NEXT:    beq a1, a2, .LBB51_15
 ; CHECK-V-NEXT:  # %bb.14: # %entry
-; CHECK-V-NEXT:    slti a1, a1, 0
+; CHECK-V-NEXT:    srli a1, a1, 63
 ; CHECK-V-NEXT:    xori a1, a1, 1
 ; CHECK-V-NEXT:    beqz a1, .LBB51_16
 ; CHECK-V-NEXT:    j .LBB51_17
@@ -6960,8 +6960,8 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:  .LBB53_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
 ; CHECK-NOV-NEXT:    slti a4, s1, 1
-; CHECK-NOV-NEXT:    slti a3, a3, 0
-; CHECK-NOV-NEXT:    slti a2, a2, 0
+; CHECK-NOV-NEXT:    srli a3, a3, 63
+; CHECK-NOV-NEXT:    srli a2, a2, 63
 ; CHECK-NOV-NEXT:    neg a1, a1
 ; CHECK-NOV-NEXT:    neg a4, a4
 ; CHECK-NOV-NEXT:    addi a3, a3, -1
@@ -7015,8 +7015,8 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:  .LBB53_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
 ; CHECK-V-NEXT:    slti a4, s1, 1
-; CHECK-V-NEXT:    slti a3, a3, 0
-; CHECK-V-NEXT:    slti a2, a2, 0
+; CHECK-V-NEXT:    srli a3, a3, 63
+; CHECK-V-NEXT:    srli a2, a2, 63
 ; CHECK-V-NEXT:    neg a1, a1
 ; CHECK-V-NEXT:    neg a4, a4
 ; CHECK-V-NEXT:    addi a3, a3, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index d916702..e7baffd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -3438,9 +3438,8 @@ define <vscale x 4 x i32> @vbrev_v(<vscale x 4 x i32> %a, iXLen %vl) {
 define <vscale x 4 x i32> @vbrev8_v(<vscale x 4 x i32> %a, iXLen %vl) {
 ; CHECK-LABEL: vbrev8_v:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vbrev8.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vbrev8.v v10, v8
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i32> @llvm.riscv.vbrev8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %a, iXLen -1)
@@ -3451,9 +3450,8 @@ define <vscale x 4 x i32> @vbrev8_v(<vscale x 4 x i32> %a, iXLen %vl) {
 define <vscale x 4 x i32> @vrev8_v(<vscale x 4 x i32> %a, iXLen %vl) {
 ; CHECK-LABEL: vrev8_v:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vrev8.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vrev8.v v10, v8
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 4 x i32> @llvm.riscv.vrev8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> %a, iXLen -1)
@@ -3560,9 +3558,8 @@ define <vscale x 4 x i32> @vrol_vx(<vscale x 4 x i32> %a, iXLen %b, iXLen %vl) {
 define <vscale x 2 x i64> @vclmul_vv(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, iXLen %vl) {
 ; CHECK-LABEL: vclmul_vv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vclmul.vv v10, v8, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vclmul.vv v10, v8, v10
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i64> @llvm.riscv.vclmul.nxv2i64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b, iXLen -1)
@@ -3573,9 +3570,8 @@ define <vscale x 2 x i64> @vclmul_vv(<vscale x 2 x i64> %a, <vscale x 2 x i64> %
 define <vscale x 2 x i64> @vclmul_vx(<vscale x 2 x i64> %a, i32 %b, iXLen %vl) {
 ; CHECK-LABEL: vclmul_vx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vclmul.vx v10, v8, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vclmul.vx v10, v8, a0
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i64> @llvm.riscv.vclmul.nxv2i64.i32(<vscale x 2 x i64> undef, <vscale x 2 x i64> %a, i32 %b, iXLen -1)
@@ -3586,9 +3582,8 @@ define <vscale x 2 x i64> @vclmul_vx(<vscale x 2 x i64> %a, i32 %b, iXLen %vl) {
 define <vscale x 2 x i64> @vclmulh_vv(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, iXLen %vl) {
 ; CHECK-LABEL: vclmulh_vv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vclmulh.vv v10, v8, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vclmulh.vv v10, v8, v10
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i64> @llvm.riscv.vclmulh.nxv2i64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b, iXLen -1)
@@ -3599,9 +3594,8 @@ define <vscale x 2 x i64> @vclmulh_vv(<vscale x 2 x i64> %a, <vscale x 2 x i64>
 define <vscale x 2 x i64> @vclmulh_vx(<vscale x 2 x i64> %a, i32 %b, iXLen %vl) {
 ; CHECK-LABEL: vclmulh_vx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vclmulh.vx v10, v8, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vclmulh.vx v10, v8, a0
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    ret
   %1 = call <vscale x 2 x i64> @llvm.riscv.vclmulh.nxv2i64.i32(<vscale x 2 x i64> undef, <vscale x 2 x i64> %a, i32 %b, iXLen -1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index 4c84304..dddcd4f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -61,11 +61,11 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT:    sltu t3, a4, t3
 ; RV32-NEXT:    and t3, t4, t3
 ; RV32-NEXT:    or t4, a1, a3
-; RV32-NEXT:    slti t4, t4, 0
+; RV32-NEXT:    srli t4, t4, 31
 ; RV32-NEXT:    or t4, t5, t4
 ; RV32-NEXT:    or t5, a1, a5
 ; RV32-NEXT:    sltu t1, a6, t1
-; RV32-NEXT:    slti t5, t5, 0
+; RV32-NEXT:    srli t5, t5, 31
 ; RV32-NEXT:    or t3, t3, t5
 ; RV32-NEXT:    or t3, t4, t3
 ; RV32-NEXT:    or t1, t1, t3
@@ -186,14 +186,14 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV64P670-NEXT:    slli t3, t2, 1
 ; RV64P670-NEXT:    and s0, s0, s1
 ; RV64P670-NEXT:    or s1, a1, a3
-; RV64P670-NEXT:    slti s1, s1, 0
+; RV64P670-NEXT:    srli s1, s1, 63
 ; RV64P670-NEXT:    or t6, s0, s1
 ; RV64P670-NEXT:    sltu s1, a0, t5
 ; RV64P670-NEXT:    sltu s0, a4, t4
 ; RV64P670-NEXT:    mv t5, a0
 ; RV64P670-NEXT:    and s0, s0, s1
 ; RV64P670-NEXT:    or s1, a1, a5
-; RV64P670-NEXT:    slti s1, s1, 0
+; RV64P670-NEXT:    srli s1, s1, 63
 ; RV64P670-NEXT:    or s0, s0, s1
 ; RV64P670-NEXT:    li s1, 32
 ; RV64P670-NEXT:    maxu s1, t3, s1
@@ -321,12 +321,12 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV64X60-NEXT:    or s2, a1, a3
 ; RV64X60-NEXT:    sltu s0, a0, t5
 ; RV64X60-NEXT:    sltu s1, a4, t3
-; RV64X60-NEXT:    slti t3, s2, 0
+; RV64X60-NEXT:    srli t3, s2, 63
 ; RV64X60-NEXT:    and s0, s0, s1
 ; RV64X60-NEXT:    or s1, a1, a5
 ; RV64X60-NEXT:    or t4, t4, t3
 ; RV64X60-NEXT:    slli t3, t2, 1
-; RV64X60-NEXT:    slti s1, s1, 0
+; RV64X60-NEXT:    srli s1, s1, 63
 ; RV64X60-NEXT:    or s0, s0, s1
 ; RV64X60-NEXT:    maxu s1, t3, t6
 ; RV64X60-NEXT:    or s0, t4, s0
@@ -461,10 +461,10 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV64-NEXT:    sltu t5, a4, t5
 ; RV64-NEXT:    and t5, t6, t5
 ; RV64-NEXT:    or t6, a1, a3
-; RV64-NEXT:    slti t6, t6, 0
+; RV64-NEXT:    srli t6, t6, 63
 ; RV64-NEXT:    or t6, s0, t6
 ; RV64-NEXT:    or s0, a1, a5
-; RV64-NEXT:    slti s0, s0, 0
+; RV64-NEXT:    srli s0, s0, 63
 ; RV64-NEXT:    or t5, t5, s0
 ; RV64-NEXT:    or t5, t6, t5
 ; RV64-NEXT:    sltu t4, a6, t4
diff --git a/llvm/test/CodeGen/RISCV/sadd_sat.ll b/llvm/test/CodeGen/RISCV/sadd_sat.ll
index 04f2436..1d6d07a 100644
--- a/llvm/test/CodeGen/RISCV/sadd_sat.ll
+++ b/llvm/test/CodeGen/RISCV/sadd_sat.ll
@@ -16,7 +16,7 @@ define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 ; RV32-NEXT:    mv a2, a0
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    slt a2, a0, a2
-; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    beq a1, a2, .LBB0_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    srai a0, a0, 31
@@ -77,7 +77,7 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    slt a2, a0, a2
-; RV64-NEXT:    slti a1, a1, 0
+; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    beq a1, a2, .LBB1_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    srai a0, a0, 63
diff --git a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
index 857026c..9200a77 100644
--- a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
@@ -17,7 +17,7 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    slt a2, a0, a3
-; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    beq a1, a2, .LBB0_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    srai a0, a0, 31
@@ -81,7 +81,7 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    add a0, a0, a2
 ; RV64-NEXT:    slt a1, a0, a1
-; RV64-NEXT:    slti a2, a2, 0
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    beq a2, a1, .LBB1_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    srai a0, a0, 63
diff --git a/llvm/test/CodeGen/RISCV/select-binop-identity.ll b/llvm/test/CodeGen/RISCV/select-binop-identity.ll
index 325e4b5..8ab66ba 100644
--- a/llvm/test/CodeGen/RISCV/select-binop-identity.ll
+++ b/llvm/test/CodeGen/RISCV/select-binop-identity.ll
@@ -260,14 +260,14 @@ define i64 @and_select_all_ones_i64_cmp2(i64 %x, i64 %y, i64 %z) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    beqz a5, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a4, a5, 0
+; RV32I-NEXT:    srli a5, a5, 31
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    sltiu a4, a4, 4
+; RV32I-NEXT:    sltiu a5, a4, 4
 ; RV32I-NEXT:  .LBB5_3:
-; RV32I-NEXT:    addi a4, a4, -1
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    addi a5, a5, -1
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    or a0, a5, a0
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    ret
@@ -300,7 +300,7 @@ define i64 @and_select_all_ones_i64_cmp2(i64 %x, i64 %y, i64 %z) {
 ;
 ; ZICOND32-LABEL: and_select_all_ones_i64_cmp2:
 ; ZICOND32:       # %bb.0:
-; ZICOND32-NEXT:    slti a6, a5, 0
+; ZICOND32-NEXT:    srli a6, a5, 31
 ; ZICOND32-NEXT:    sltiu a4, a4, 4
 ; ZICOND32-NEXT:    czero.eqz a6, a6, a5
 ; ZICOND32-NEXT:    czero.nez a4, a4, a5
diff --git a/llvm/test/CodeGen/RISCV/select-cc.ll b/llvm/test/CodeGen/RISCV/select-cc.ll
index ec1f8ae..3df0707 100644
--- a/llvm/test/CodeGen/RISCV/select-cc.ll
+++ b/llvm/test/CodeGen/RISCV/select-cc.ll
@@ -200,7 +200,7 @@ define signext i32 @foo(i32 signext %a, ptr %b) nounwind {
 ; RV64I-CCMOV-NEXT:    lw a4, 0(a1)
 ; RV64I-CCMOV-NEXT:    slti a5, a2, 1
 ; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a0, a2
-; RV64I-CCMOV-NEXT:    slti a5, a2, 0
+; RV64I-CCMOV-NEXT:    srli a5, a2, 63
 ; RV64I-CCMOV-NEXT:    mips.ccmov a0, a5, a3, a0
 ; RV64I-CCMOV-NEXT:    lw a1, 0(a1)
 ; RV64I-CCMOV-NEXT:    slti a3, a4, 1025
@@ -384,11 +384,11 @@ define i64 @select_sge_int32min(i64 %x, i64 %y, i64 %z) {
 ; RV32I-NEXT:    li a6, -1
 ; RV32I-NEXT:    bne a1, a6, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    slti a0, a1, 0
-; RV32I-NEXT:    xori a0, a0, 1
+; RV32I-NEXT:    srli a1, a1, 31
+; RV32I-NEXT:    xori a0, a1, 1
 ; RV32I-NEXT:  .LBB3_3:
 ; RV32I-NEXT:    bnez a0, .LBB3_5
 ; RV32I-NEXT:  # %bb.4:
diff --git a/llvm/test/CodeGen/RISCV/select-constant-xor.ll b/llvm/test/CodeGen/RISCV/select-constant-xor.ll
index 72313a8..f11fb61 100644
--- a/llvm/test/CodeGen/RISCV/select-constant-xor.ll
+++ b/llvm/test/CodeGen/RISCV/select-constant-xor.ll
@@ -48,8 +48,8 @@ define i64 @selecti64i64(i64 %a) {
 define i32 @selecti64i32(i64 %a) {
 ; RV32-LABEL: selecti64i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slti a0, a1, 0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    xori a0, a1, 1
 ; RV32-NEXT:    lui a1, 524288
 ; RV32-NEXT:    sub a0, a1, a0
 ; RV32-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll b/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll
index 3fbaeff..fa1807c 100644
--- a/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll
+++ b/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll
@@ -76,12 +76,19 @@ define i32 @not_pos_sel_same_variable(i32 signext %a) {
 
 ; Compare if positive and select of constants where one constant is zero.
 define i32 @pos_sel_constants(i32 signext %a) {
-; CHECK-LABEL: pos_sel_constants:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    slti a0, a0, 0
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    andi a0, a0, 5
-; CHECK-NEXT:    ret
+; RV32-LABEL: pos_sel_constants:
+; RV32:       # %bb.0:
+; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    andi a0, a0, 5
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: pos_sel_constants:
+; RV64:       # %bb.0:
+; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    andi a0, a0, 5
+; RV64-NEXT:    ret
   %tmp.1 = icmp sgt i32 %a, -1
   %retval = select i1 %tmp.1, i32 5, i32 0
   ret i32 %retval
@@ -101,7 +108,7 @@ define i32 @pos_sel_special_constant(i32 signext %a) {
 ;
 ; RV64-LABEL: pos_sel_special_constant:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slti a0, a0, 0
+; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    xori a0, a0, 1
 ; RV64-NEXT:    slli a0, a0, 9
 ; RV64-NEXT:    ret
@@ -114,14 +121,14 @@ define i32 @pos_sel_special_constant(i32 signext %a) {
 define i32 @pos_sel_variable_and_zero(i32 signext %a, i32 signext %b) {
 ; RV32I-LABEL: pos_sel_variable_and_zero:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slti a0, a0, 0
+; RV32I-NEXT:    srli a0, a0, 31
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: pos_sel_variable_and_zero:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    slti a0, a0, 0
+; RV64I-NEXT:    srli a0, a0, 63
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/stack-folding.ll b/llvm/test/CodeGen/RISCV/stack-folding.ll
index 8373a74..0e32911 100644
--- a/llvm/test/CodeGen/RISCV/stack-folding.ll
+++ b/llvm/test/CodeGen/RISCV/stack-folding.ll
@@ -31,8 +31,8 @@ define i1 @test_sext_w(i64 %x, i32 %y) nounwind {
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    j .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: # %truebb
-; CHECK-NEXT:    lw a0, 8(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    slti a0, a0, 0
+; CHECK-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    srliw a0, a0, 31
 ; CHECK-NEXT:  .LBB0_3: # %falsebb
 ; CHECK-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index a30593d..2751332c 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -14,7 +14,7 @@ define zeroext i1 @saddo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a3, a0, a1
 ; RV32-NEXT:    slt a0, a3, a0
-; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    sw a3, 0(a2)
 ; RV32-NEXT:    ret
@@ -32,7 +32,7 @@ define zeroext i1 @saddo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a3, a0, a1
 ; RV32ZBA-NEXT:    slt a0, a3, a0
-; RV32ZBA-NEXT:    slti a1, a1, 0
+; RV32ZBA-NEXT:    srli a1, a1, 31
 ; RV32ZBA-NEXT:    xor a0, a1, a0
 ; RV32ZBA-NEXT:    sw a3, 0(a2)
 ; RV32ZBA-NEXT:    ret
@@ -50,7 +50,7 @@ define zeroext i1 @saddo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a3, a0, a1
 ; RV32ZICOND-NEXT:    slt a0, a3, a0
-; RV32ZICOND-NEXT:    slti a1, a1, 0
+; RV32ZICOND-NEXT:    srli a1, a1, 31
 ; RV32ZICOND-NEXT:    xor a0, a1, a0
 ; RV32ZICOND-NEXT:    sw a3, 0(a2)
 ; RV32ZICOND-NEXT:    ret
@@ -252,8 +252,8 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-NEXT:    not a3, a3
 ; RV32-NEXT:    add a5, a5, a0
 ; RV32-NEXT:    xor a1, a1, a5
-; RV32-NEXT:    and a1, a3, a1
-; RV32-NEXT:    slti a0, a1, 0
+; RV32-NEXT:    and a0, a3, a1
+; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    sw a2, 0(a4)
 ; RV32-NEXT:    sw a5, 4(a4)
 ; RV32-NEXT:    ret
@@ -262,7 +262,7 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    add a3, a0, a1
 ; RV64-NEXT:    slt a0, a3, a0
-; RV64-NEXT:    slti a1, a1, 0
+; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    xor a0, a1, a0
 ; RV64-NEXT:    sd a3, 0(a2)
 ; RV64-NEXT:    ret
@@ -276,8 +276,8 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-NEXT:    not a3, a3
 ; RV32ZBA-NEXT:    add a5, a5, a0
 ; RV32ZBA-NEXT:    xor a1, a1, a5
-; RV32ZBA-NEXT:    and a1, a3, a1
-; RV32ZBA-NEXT:    slti a0, a1, 0
+; RV32ZBA-NEXT:    and a0, a3, a1
+; RV32ZBA-NEXT:    srli a0, a0, 31
 ; RV32ZBA-NEXT:    sw a2, 0(a4)
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
 ; RV32ZBA-NEXT:    ret
@@ -286,7 +286,7 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    add a3, a0, a1
 ; RV64ZBA-NEXT:    slt a0, a3, a0
-; RV64ZBA-NEXT:    slti a1, a1, 0
+; RV64ZBA-NEXT:    srli a1, a1, 63
 ; RV64ZBA-NEXT:    xor a0, a1, a0
 ; RV64ZBA-NEXT:    sd a3, 0(a2)
 ; RV64ZBA-NEXT:    ret
@@ -300,8 +300,8 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND-NEXT:    not a3, a3
 ; RV32ZICOND-NEXT:    add a5, a5, a0
 ; RV32ZICOND-NEXT:    xor a1, a1, a5
-; RV32ZICOND-NEXT:    and a1, a3, a1
-; RV32ZICOND-NEXT:    slti a0, a1, 0
+; RV32ZICOND-NEXT:    and a0, a3, a1
+; RV32ZICOND-NEXT:    srli a0, a0, 31
 ; RV32ZICOND-NEXT:    sw a2, 0(a4)
 ; RV32ZICOND-NEXT:    sw a5, 4(a4)
 ; RV32ZICOND-NEXT:    ret
@@ -310,7 +310,7 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    add a3, a0, a1
 ; RV64ZICOND-NEXT:    slt a0, a3, a0
-; RV64ZICOND-NEXT:    slti a1, a1, 0
+; RV64ZICOND-NEXT:    srli a1, a1, 63
 ; RV64ZICOND-NEXT:    xor a0, a1, a0
 ; RV64ZICOND-NEXT:    sd a3, 0(a2)
 ; RV64ZICOND-NEXT:    ret
@@ -330,8 +330,8 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
 ; RV32-NEXT:    sltu a0, a3, a0
 ; RV32-NEXT:    add a5, a1, a0
 ; RV32-NEXT:    xor a1, a1, a5
-; RV32-NEXT:    and a1, a4, a1
-; RV32-NEXT:    slti a0, a1, 0
+; RV32-NEXT:    and a0, a4, a1
+; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    sw a3, 0(a2)
 ; RV32-NEXT:    sw a5, 4(a2)
 ; RV32-NEXT:    ret
@@ -350,8 +350,8 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
 ; RV32ZBA-NEXT:    sltu a0, a3, a0
 ; RV32ZBA-NEXT:    add a5, a1, a0
 ; RV32ZBA-NEXT:    xor a1, a1, a5
-; RV32ZBA-NEXT:    and a1, a4, a1
-; RV32ZBA-NEXT:    slti a0, a1, 0
+; RV32ZBA-NEXT:    and a0, a4, a1
+; RV32ZBA-NEXT:    srli a0, a0, 31
 ; RV32ZBA-NEXT:    sw a3, 0(a2)
 ; RV32ZBA-NEXT:    sw a5, 4(a2)
 ; RV32ZBA-NEXT:    ret
@@ -370,8 +370,8 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
 ; RV32ZICOND-NEXT:    sltu a0, a3, a0
 ; RV32ZICOND-NEXT:    add a5, a1, a0
 ; RV32ZICOND-NEXT:    xor a1, a1, a5
-; RV32ZICOND-NEXT:    and a1, a4, a1
-; RV32ZICOND-NEXT:    slti a0, a1, 0
+; RV32ZICOND-NEXT:    and a0, a4, a1
+; RV32ZICOND-NEXT:    srli a0, a0, 31
 ; RV32ZICOND-NEXT:    sw a3, 0(a2)
 ; RV32ZICOND-NEXT:    sw a5, 4(a2)
 ; RV32ZICOND-NEXT:    ret
@@ -399,7 +399,7 @@ define zeroext i1 @saddo3.i64(i64 %v1, ptr %res) {
 ; RV32-NEXT:    addi a4, a0, -1
 ; RV32-NEXT:    xor a0, a1, a4
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    sw a3, 0(a2)
 ; RV32-NEXT:    sw a4, 4(a2)
 ; RV32-NEXT:    ret
@@ -420,7 +420,7 @@ define zeroext i1 @saddo3.i64(i64 %v1, ptr %res) {
 ; RV32ZBA-NEXT:    addi a4, a0, -1
 ; RV32ZBA-NEXT:    xor a0, a1, a4
 ; RV32ZBA-NEXT:    and a0, a1, a0
-; RV32ZBA-NEXT:    slti a0, a0, 0
+; RV32ZBA-NEXT:    srli a0, a0, 31
 ; RV32ZBA-NEXT:    sw a3, 0(a2)
 ; RV32ZBA-NEXT:    sw a4, 4(a2)
 ; RV32ZBA-NEXT:    ret
@@ -441,7 +441,7 @@ define zeroext i1 @saddo3.i64(i64 %v1, ptr %res) {
 ; RV32ZICOND-NEXT:    addi a4, a0, -1
 ; RV32ZICOND-NEXT:    xor a0, a1, a4
 ; RV32ZICOND-NEXT:    and a0, a1, a0
-; RV32ZICOND-NEXT:    slti a0, a0, 0
+; RV32ZICOND-NEXT:    srli a0, a0, 31
 ; RV32ZICOND-NEXT:    sw a3, 0(a2)
 ; RV32ZICOND-NEXT:    sw a4, 4(a2)
 ; RV32ZICOND-NEXT:    ret
@@ -866,8 +866,8 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-NEXT:    sub a2, a0, a2
 ; RV32-NEXT:    sub a5, a6, a5
 ; RV32-NEXT:    xor a1, a1, a5
-; RV32-NEXT:    and a1, a3, a1
-; RV32-NEXT:    slti a0, a1, 0
+; RV32-NEXT:    and a0, a3, a1
+; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    sw a2, 0(a4)
 ; RV32-NEXT:    sw a5, 4(a4)
 ; RV32-NEXT:    ret
@@ -889,8 +889,8 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-NEXT:    sub a2, a0, a2
 ; RV32ZBA-NEXT:    sub a5, a6, a5
 ; RV32ZBA-NEXT:    xor a1, a1, a5
-; RV32ZBA-NEXT:    and a1, a3, a1
-; RV32ZBA-NEXT:    slti a0, a1, 0
+; RV32ZBA-NEXT:    and a0, a3, a1
+; RV32ZBA-NEXT:    srli a0, a0, 31
 ; RV32ZBA-NEXT:    sw a2, 0(a4)
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
 ; RV32ZBA-NEXT:    ret
@@ -912,8 +912,8 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND-NEXT:    sub a2, a0, a2
 ; RV32ZICOND-NEXT:    sub a5, a6, a5
 ; RV32ZICOND-NEXT:    xor a1, a1, a5
-; RV32ZICOND-NEXT:    and a1, a3, a1
-; RV32ZICOND-NEXT:    slti a0, a1, 0
+; RV32ZICOND-NEXT:    and a0, a3, a1
+; RV32ZICOND-NEXT:    srli a0, a0, 31
 ; RV32ZICOND-NEXT:    sw a2, 0(a4)
 ; RV32ZICOND-NEXT:    sw a5, 4(a4)
 ; RV32ZICOND-NEXT:    ret
@@ -1963,7 +1963,7 @@ define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a2, a0, a1
 ; RV32-NEXT:    slt a2, a2, a0
-; RV32-NEXT:    slti a3, a1, 0
+; RV32-NEXT:    srli a3, a1, 31
 ; RV32-NEXT:    bne a3, a2, .LBB28_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    mv a0, a1
@@ -1984,7 +1984,7 @@ define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a2, a0, a1
 ; RV32ZBA-NEXT:    slt a2, a2, a0
-; RV32ZBA-NEXT:    slti a3, a1, 0
+; RV32ZBA-NEXT:    srli a3, a1, 31
 ; RV32ZBA-NEXT:    bne a3, a2, .LBB28_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    mv a0, a1
@@ -2004,7 +2004,7 @@ define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZICOND-LABEL: saddo.select.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a2, a0, a1
-; RV32ZICOND-NEXT:    slti a3, a1, 0
+; RV32ZICOND-NEXT:    srli a3, a1, 31
 ; RV32ZICOND-NEXT:    slt a2, a2, a0
 ; RV32ZICOND-NEXT:    xor a2, a3, a2
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
@@ -2033,7 +2033,7 @@ define i1 @saddo.not.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a2, a0, a1
 ; RV32-NEXT:    slt a0, a2, a0
-; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
@@ -2050,7 +2050,7 @@ define i1 @saddo.not.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a2, a0, a1
 ; RV32ZBA-NEXT:    slt a0, a2, a0
-; RV32ZBA-NEXT:    slti a1, a1, 0
+; RV32ZBA-NEXT:    srli a1, a1, 31
 ; RV32ZBA-NEXT:    xor a0, a1, a0
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
@@ -2067,7 +2067,7 @@ define i1 @saddo.not.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a2, a0, a1
 ; RV32ZICOND-NEXT:    slt a0, a2, a0
-; RV32ZICOND-NEXT:    slti a1, a1, 0
+; RV32ZICOND-NEXT:    srli a1, a1, 31
 ; RV32ZICOND-NEXT:    xor a0, a1, a0
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
@@ -2108,7 +2108,7 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    add a2, a0, a1
 ; RV64-NEXT:    slt a2, a2, a0
-; RV64-NEXT:    slti a3, a1, 0
+; RV64-NEXT:    srli a3, a1, 63
 ; RV64-NEXT:    bne a3, a2, .LBB30_2
 ; RV64-NEXT:  # %bb.1: # %entry
 ; RV64-NEXT:    mv a0, a1
@@ -2136,7 +2136,7 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    add a2, a0, a1
 ; RV64ZBA-NEXT:    slt a2, a2, a0
-; RV64ZBA-NEXT:    slti a3, a1, 0
+; RV64ZBA-NEXT:    srli a3, a1, 63
 ; RV64ZBA-NEXT:    bne a3, a2, .LBB30_2
 ; RV64ZBA-NEXT:  # %bb.1: # %entry
 ; RV64ZBA-NEXT:    mv a0, a1
@@ -2153,7 +2153,7 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    not a5, a5
 ; RV32ZICOND-NEXT:    xor a4, a1, a4
 ; RV32ZICOND-NEXT:    and a4, a5, a4
-; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    srli a4, a4, 31
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
@@ -2165,7 +2165,7 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ; RV64ZICOND-LABEL: saddo.select.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    add a2, a0, a1
-; RV64ZICOND-NEXT:    slti a3, a1, 0
+; RV64ZICOND-NEXT:    srli a3, a1, 63
 ; RV64ZICOND-NEXT:    slt a2, a2, a0
 ; RV64ZICOND-NEXT:    xor a2, a3, a2
 ; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
@@ -2190,7 +2190,7 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    not a1, a3
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
 ;
@@ -2198,7 +2198,7 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    add a2, a0, a1
 ; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    slti a1, a1, 0
+; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    xor a0, a1, a0
 ; RV64-NEXT:    xori a0, a0, 1
 ; RV64-NEXT:    ret
@@ -2213,7 +2213,7 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    xor a0, a1, a0
 ; RV32ZBA-NEXT:    not a1, a3
 ; RV32ZBA-NEXT:    and a0, a1, a0
-; RV32ZBA-NEXT:    slti a0, a0, 0
+; RV32ZBA-NEXT:    srli a0, a0, 31
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
 ;
@@ -2221,7 +2221,7 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    add a2, a0, a1
 ; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    slti a1, a1, 0
+; RV64ZBA-NEXT:    srli a1, a1, 63
 ; RV64ZBA-NEXT:    xor a0, a1, a0
 ; RV64ZBA-NEXT:    xori a0, a0, 1
 ; RV64ZBA-NEXT:    ret
@@ -2236,7 +2236,7 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    xor a0, a1, a0
 ; RV32ZICOND-NEXT:    not a1, a3
 ; RV32ZICOND-NEXT:    and a0, a1, a0
-; RV32ZICOND-NEXT:    slti a0, a0, 0
+; RV32ZICOND-NEXT:    srli a0, a0, 31
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -2244,7 +2244,7 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    add a2, a0, a1
 ; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    slti a1, a1, 0
+; RV64ZICOND-NEXT:    srli a1, a1, 63
 ; RV64ZICOND-NEXT:    xor a0, a1, a0
 ; RV64ZICOND-NEXT:    xori a0, a0, 1
 ; RV64ZICOND-NEXT:    ret
@@ -2713,7 +2713,7 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    xor a4, a1, a3
 ; RV32ZICOND-NEXT:    xor a5, a1, a5
 ; RV32ZICOND-NEXT:    and a4, a4, a5
-; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    srli a4, a4, 31
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
@@ -2748,8 +2748,8 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    xor a2, a1, a2
 ; RV32-NEXT:    xor a1, a1, a3
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    slti a0, a1, 0
-; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    xori a0, a1, 1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ssub.not.i64:
@@ -2769,8 +2769,8 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    xor a2, a1, a2
 ; RV32ZBA-NEXT:    xor a1, a1, a3
 ; RV32ZBA-NEXT:    and a1, a1, a2
-; RV32ZBA-NEXT:    slti a0, a1, 0
-; RV32ZBA-NEXT:    xori a0, a0, 1
+; RV32ZBA-NEXT:    srli a1, a1, 31
+; RV32ZBA-NEXT:    xori a0, a1, 1
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: ssub.not.i64:
@@ -2790,8 +2790,8 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    xor a2, a1, a2
 ; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    and a1, a1, a2
-; RV32ZICOND-NEXT:    slti a0, a1, 0
-; RV32ZICOND-NEXT:    xori a0, a0, 1
+; RV32ZICOND-NEXT:    srli a1, a1, 31
+; RV32ZICOND-NEXT:    xori a0, a1, 1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: ssub.not.i64:
@@ -3821,7 +3821,7 @@ define zeroext i1 @saddo.br.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a2, a0, a1
 ; RV32-NEXT:    slt a0, a2, a0
-; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    beq a1, a0, .LBB52_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
@@ -3846,7 +3846,7 @@ define zeroext i1 @saddo.br.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a2, a0, a1
 ; RV32ZBA-NEXT:    slt a0, a2, a0
-; RV32ZBA-NEXT:    slti a1, a1, 0
+; RV32ZBA-NEXT:    srli a1, a1, 31
 ; RV32ZBA-NEXT:    beq a1, a0, .LBB52_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
@@ -3871,7 +3871,7 @@ define zeroext i1 @saddo.br.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a2, a0, a1
 ; RV32ZICOND-NEXT:    slt a0, a2, a0
-; RV32ZICOND-NEXT:    slti a1, a1, 0
+; RV32ZICOND-NEXT:    srli a1, a1, 31
 ; RV32ZICOND-NEXT:    beq a1, a0, .LBB52_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
@@ -3927,7 +3927,7 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    add a2, a0, a1
 ; RV64-NEXT:    slt a0, a2, a0
-; RV64-NEXT:    slti a1, a1, 0
+; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    beq a1, a0, .LBB53_2
 ; RV64-NEXT:  # %bb.1: # %overflow
 ; RV64-NEXT:    li a0, 0
@@ -3958,7 +3958,7 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    add a2, a0, a1
 ; RV64ZBA-NEXT:    slt a0, a2, a0
-; RV64ZBA-NEXT:    slti a1, a1, 0
+; RV64ZBA-NEXT:    srli a1, a1, 63
 ; RV64ZBA-NEXT:    beq a1, a0, .LBB53_2
 ; RV64ZBA-NEXT:  # %bb.1: # %overflow
 ; RV64ZBA-NEXT:    li a0, 0
@@ -3989,7 +3989,7 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    add a2, a0, a1
 ; RV64ZICOND-NEXT:    slt a0, a2, a0
-; RV64ZICOND-NEXT:    slti a1, a1, 0
+; RV64ZICOND-NEXT:    srli a1, a1, 63
 ; RV64ZICOND-NEXT:    beq a1, a0, .LBB53_2
 ; RV64ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV64ZICOND-NEXT:    li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index c75bb9d..3bbf333 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -11,7 +11,7 @@ define i32 @addsat(i32 %a, i32 %b) {
 ; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    slt a2, a0, a2
-; RV32I-NEXT:    slti a1, a1, 0
+; RV32I-NEXT:    srli a1, a1, 31
 ; RV32I-NEXT:    beq a1, a2, .LBB0_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srai a0, a0, 31
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-insbi.ll b/llvm/test/CodeGen/RISCV/xqcibm-insbi.ll
new file mode 100644
index 0000000..e4a5451
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xqcibm-insbi.ll
@@ -0,0 +1,262 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 --verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV32I
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcibm --verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV32XQCIBM
+
+define i32 @insbi(i32 %in1) nounwind {
+; RV32I-LABEL: insbi:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xori a1, a0, 176
+; RV32I-NEXT:    andi a1, a1, 496
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.insbi a0, 11, 5, 4
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, 176
+  %and1 = and i32 %xor1, 496
+  %xor2 = xor i32 %and1, %in1
+  ret i32 %xor2
+}
+
+define i32 @insbi_comm_xor(i32 %in1) nounwind {
+; RV32I-LABEL: insbi_comm_xor:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 9
+; RV32I-NEXT:    li a2, 15
+; RV32I-NEXT:    slli a1, a1, 9
+; RV32I-NEXT:    xor a1, a0, a1
+; RV32I-NEXT:    slli a2, a2, 9
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_comm_xor:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.insbi a0, 9, 4, 9
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 4608, %in1
+  %and1 = and i32 %xor1, 7680
+  %xor2 = xor i32 %and1, %in1
+  ret i32 %xor2
+}
+
+define i32 @insbi_comm_and(i32 %in1) nounwind {
+; RV32I-LABEL: insbi_comm_and:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 11
+; RV32I-NEXT:    li a2, 15
+; RV32I-NEXT:    slli a1, a1, 9
+; RV32I-NEXT:    xor a1, a0, a1
+; RV32I-NEXT:    slli a2, a2, 9
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_comm_and:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.insbi a0, 11, 4, 9
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, 5632
+  %and1 = and i32 7680, %xor1
+  %xor2 = xor i32 %and1, %in1
+  ret i32 %xor2
+}
+
+define i32 @insbi_comm_xor2(i32 %in1) nounwind {
+; RV32I-LABEL: insbi_comm_xor2:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xori a1, a0, 176
+; RV32I-NEXT:    andi a1, a1, 496
+; RV32I-NEXT:    xor a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_comm_xor2:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.insbi a0, 11, 5, 4
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, 176
+  %and1 = and i32 %xor1, 496
+  %xor2 = xor i32 %in1, %and1
+  ret i32 %xor2
+}
+
+define i32 @insbi_immg(i32 %in1) nounwind {
+; RV32I-LABEL: insbi_immg:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xori a1, a0, 256
+; RV32I-NEXT:    andi a1, a1, 496
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_immg:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    li a1, 16
+; RV32XQCIBM-NEXT:    qc.insb a0, a1, 5, 4
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, 256
+  %and1 = and i32 %xor1, 496
+  %xor2 = xor i32 %and1, %in1
+  ret i32 %xor2
+}
+
+define i32 @insbi_not_shifted_mask(i32 %in1) nounwind {
+; RV32I-LABEL: insbi_not_shifted_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xori a1, a0, 128
+; RV32I-NEXT:    andi a1, a1, 716
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_not_shifted_mask:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    xori a1, a0, 128
+; RV32XQCIBM-NEXT:    andi a1, a1, 716
+; RV32XQCIBM-NEXT:    xor a0, a0, a1
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, 176
+  %and1 = and i32 %xor1, 716
+  %xor2 = xor i32 %and1, %in1
+  ret i32 %xor2
+}
+
+define i32 @insbi_width_z(i32 %in1) nounwind {
+; RV32I-LABEL: insbi_width_z:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a0, 256
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_width_z:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    andi a1, a0, 256
+; RV32XQCIBM-NEXT:    xor a0, a0, a1
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, 176
+  %and1 = and i32 %xor1, 256
+  %xor2 = xor i32 %and1, %in1
+  ret i32 %xor2
+}
+
+define i32 @insbi_mul_use_and(i32 %in1, i32 %in2) nounwind {
+; RV32I-LABEL: insbi_mul_use_and:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a1, 11
+; RV32I-NEXT:    li a2, 15
+; RV32I-NEXT:    slli a1, a1, 9
+; RV32I-NEXT:    slli a2, a2, 9
+; RV32I-NEXT:    xor a1, a0, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    xor a2, a1, a0
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_mul_use_and:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    li a1, 11
+; RV32XQCIBM-NEXT:    li a2, 15
+; RV32XQCIBM-NEXT:    slli a1, a1, 9
+; RV32XQCIBM-NEXT:    slli a2, a2, 9
+; RV32XQCIBM-NEXT:    xor a1, a1, a0
+; RV32XQCIBM-NEXT:    and a1, a1, a2
+; RV32XQCIBM-NEXT:    xor a2, a1, a0
+; RV32XQCIBM-NEXT:    add a0, a0, a1
+; RV32XQCIBM-NEXT:    add a0, a0, a2
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, 5632
+  %and1 = and i32 %xor1, 7680
+  %xor2 = xor i32 %and1, %in1
+  %add1 = add i32 %in1, %and1
+  %add2 = add i32 %add1, %xor2
+  ret i32 %add2
+}
+
+define i32 @insbi_mul_use_xor(i32 %in1, i32 %in2) nounwind {
+; RV32I-LABEL: insbi_mul_use_xor:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xori a1, a0, 176
+; RV32I-NEXT:    andi a2, a1, 496
+; RV32I-NEXT:    xor a2, a2, a0
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_mul_use_xor:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    xori a1, a0, 176
+; RV32XQCIBM-NEXT:    andi a2, a1, 496
+; RV32XQCIBM-NEXT:    xor a2, a2, a0
+; RV32XQCIBM-NEXT:    add a0, a0, a1
+; RV32XQCIBM-NEXT:    add a0, a0, a2
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, 176
+  %and1 = and i32 %xor1, 496
+  %xor2 = xor i32 %and1, %in1
+  %add1 = add i32 %in1, %xor1
+  %add2 = add i32 %add1, %xor2
+  ret i32 %add2
+}
+
+define i32 @insbi_imm_too_neg(i32 %in1) nounwind {
+; RV32I-LABEL: insbi_imm_too_neg:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xori a1, a0, -34
+; RV32I-NEXT:    andi a1, a1, -2
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_imm_too_neg:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    li a1, -17
+; RV32XQCIBM-NEXT:    qc.insb a0, a1, 31, 1
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i32 %in1, -34
+  %and1 = and i32 %xor1, -2
+  %xor2 = xor i32 %and1, %in1
+  ret i32 %xor2
+}
+
+define i64 @insbi_i64(i64 %in1) nounwind {
+; RV32I-LABEL: insbi_i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a2, 57344
+; RV32I-NEXT:    lui a3, 1044480
+; RV32I-NEXT:    xor a2, a0, a2
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    zext.b a3, a1
+; RV32I-NEXT:    xor a1, a3, a1
+; RV32I-NEXT:    xor a0, a2, a0
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_i64:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.extu a2, a1, 8, 0
+; RV32XQCIBM-NEXT:    xor a1, a1, a2
+; RV32XQCIBM-NEXT:    qc.insbi a0, 14, 8, 24
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i64 %in1, 234881024
+  %and1 = and i64 %xor1, 1099494850560
+  %xor2 = xor i64 %and1, %in1
+  ret i64 %xor2
+}
+define i64 @insbi_i64_large_mask(i64 %in1) nounwind {
+; RV32I-LABEL: insbi_i64_large_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    xori a2, a1, 9
+; RV32I-NEXT:    andi a2, a2, 15
+; RV32I-NEXT:    xor a1, a2, a1
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: insbi_i64_large_mask:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.insbi a1, 9, 4, 0
+; RV32XQCIBM-NEXT:    ret
+  %xor1 = xor i64 %in1, 38654705664
+  %and1 = and i64 %xor1, 64424509440
+  %xor2 = xor i64 %and1, %in1
+  ret i64 %xor2
+}
diff --git a/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll b/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll
index 2d48f2b4..17b3534 100644
--- a/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll
+++ b/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll
@@ -224,7 +224,7 @@ define i1 @flo(float %c, float %a, float %b) {
 ; CHECK-RV64I-NEXT:    mv a1, s1
 ; CHECK-RV64I-NEXT:    call __gesf2
 ; CHECK-RV64I-NEXT:    or a0, s2, a0
-; CHECK-RV64I-NEXT:    slti a0, a0, 0
+; CHECK-RV64I-NEXT:    srli a0, a0, 63
 ; CHECK-RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -275,7 +275,7 @@ define i1 @dlo(double %c, double %a, double %b) {
 ; CHECK-NEXT:    mv a1, s1
 ; CHECK-NEXT:    call __gedf2
 ; CHECK-NEXT:    or a0, s2, a0
-; CHECK-NEXT:    slti a0, a0, 0
+; CHECK-NEXT:    srli a0, a0, 63
 ; CHECK-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/callbr-asm-loop.ll b/llvm/test/CodeGen/X86/callbr-asm-loop.ll
index 999b04c..0b68988 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-loop.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-loop.ll
@@ -1,35 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 
-; RUN: llc -O0 -mtriple=i686-- < %s | FileCheck %s
+; RUN: llc -O1 -mtriple=i686-- < %s | FileCheck %s
 
 ; Test that causes multiple defs of %eax.
-; FIXME: The testcase hangs with -O1/2/3 enabled.
 define i32 @loop1() nounwind {
 ; CHECK-LABEL: loop1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: # %tailrecurse
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    movl $1, %edx
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    movl %edx, %esi
-; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:  .LBB0_2: # Inline asm indirect target
-; CHECK-NEXT:    # %tailrecurse.tailrecurse.backedge_crit_edge
+; CHECK-NEXT:    # %tailrecurse.tailrecurse_crit_edge
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    # Label of block must be emitted
-; CHECK-NEXT:  .LBB0_3: # %tailrecurse.backedge
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    jmp .LBB0_1
-; CHECK-NEXT:  .LBB0_4: # Inline asm indirect target
+; CHECK-NEXT:  .LBB0_3: # Inline asm indirect target
 ; CHECK-NEXT:    # %lab2.split
 ; CHECK-NEXT:    # Label of block must be emitted
 ; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    retl
 entry:
   br label %tailrecurse
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 9728e13..ce03f8f 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -28,8 +28,9 @@ define i64 @PR62286(i32 %a) {
 ; AVX1-NEXT:    vmovd %edi, %xmm0
 ; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
 ; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
 ; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
@@ -42,10 +43,10 @@ define i64 @PR62286(i32 %a) {
 ; AVX2-LABEL: PR62286:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
@@ -58,12 +59,13 @@ define i64 @PR62286(i32 %a) {
 ; AVX512-LABEL: PR62286:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovd %edi, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; AVX512-NEXT:    movw $4369, %ax # imm = 0x1111
+; AVX512-NEXT:    movb $8, %al
 ; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    vpaddd %zmm0, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm0
+; AVX512-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 87c135d..ef20cf2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1724,6 +1724,269 @@ define void @PR54562_mem(ptr %src, ptr %dst) {
   ret void
 }
 
+define <512 x i8> @PR153457(<512 x i8> %a0, <512 x i8> %a1) nounwind {
+; AVX512F-LABEL: PR153457:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-64, %rsp
+; AVX512F-NEXT:    subq $64, %rsp
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %ymm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm7
+; AVX512F-NEXT:    vpbroadcastd %xmm0, %ymm9
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm5, %xmm7
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512F-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512F-NEXT:    vpor %xmm11, %xmm10, %xmm10
+; AVX512F-NEXT:    vpslld $24, %xmm0, %xmm11
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vmovdqa 16(%rbp), %xmm11
+; AVX512F-NEXT:    vpsrld $16, %xmm11, %xmm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512F-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT:    vpsrld $24, %xmm11, %xmm8
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512F-NEXT:    vpsrlq $48, %xmm11, %xmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpbroadcastb 16(%rbp), %ymm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-NEXT:    vpsrlq $56, %xmm11, %xmm7
+; AVX512F-NEXT:    vmovdqa %ymm7, 416(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm6, 384(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: PR153457:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $64, %rsp
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    vmovdqa64 16(%rbp), %zmm7
+; AVX512BW-NEXT:    vpbroadcastq %xmm0, %ymm8
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512BW-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm8
+; AVX512BW-NEXT:    vpbroadcastd %xmm0, %ymm10
+; AVX512BW-NEXT:    vpblendvb %ymm9, %ymm8, %ymm10, %ymm8
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm5, %xmm8
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512BW-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm9, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpslld $24, %xmm0, %xmm9
+; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendvb %ymm10, %ymm3, %ymm9, %ymm3
+; AVX512BW-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vporq %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm1, %ymm8
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm2, %zmm8
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT:    vpsrld $24, %xmm7, %xmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti32x4 $3, %xmm7, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,1,2,0]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpbroadcastb 16(%rbp), %ymm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm5, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 320(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 384(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: PR153457:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    movq %rsp, %rbp
+; AVX512DQ-NEXT:    andq $-64, %rsp
+; AVX512DQ-NEXT:    subq $64, %rsp
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm7
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512DQ-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm7
+; AVX512DQ-NEXT:    vpbroadcastd %xmm0, %ymm9
+; AVX512DQ-NEXT:    vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm5, %xmm7
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm10, %xmm10
+; AVX512DQ-NEXT:    vpslld $24, %xmm0, %xmm11
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa 16(%rbp), %xmm11
+; AVX512DQ-NEXT:    vpsrld $16, %xmm11, %xmm12
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vpsrld $24, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vpsrlq $48, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpbroadcastb 16(%rbp), %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpsrlq $56, %xmm11, %xmm7
+; AVX512DQ-NEXT:    vmovdqa %ymm7, 416(%rdi)
+; AVX512DQ-NEXT:    vmovdqa %ymm6, 384(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512DQ-NEXT:    movq %rbp, %rsp
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VBMI-LABEL: PR153457:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
+; AVX512VBMI-NEXT:    andq $-64, %rsp
+; AVX512VBMI-NEXT:    subq $64, %rsp
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vmovdqa64 16(%rbp), %zmm7
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm5, %zmm8
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm4, %zmm5
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0]
+; AVX512VBMI-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm3, %zmm4
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67,0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67]
+; AVX512VBMI-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm2, %zmm3
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vporq %zmm2, %zmm1, %zmm1
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,71]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm6, %zmm2
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,0,64,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm0, %zmm6
+; AVX512VBMI-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    vpermi2w %zmm7, %zmm3, %zmm0
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [67,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,68,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm4, %zmm3
+; AVX512VBMI-NEXT:    vinserti32x4 $3, %xmm7, %zmm5, %zmm4
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512VBMI-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2w %zmm7, %zmm8, %zmm5
+; AVX512VBMI-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,65,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm1, %zmm8
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm0, 128(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm8, 64(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm6, (%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm2, 384(%rdi)
+; AVX512VBMI-NEXT:    movq %rbp, %rsp
+; AVX512VBMI-NEXT:    popq %rbp
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %shuffle1 = shufflevector <512 x i8> %a0, <512 x i8> zeroinitializer, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 
187, i32 188, i32 189, i32 190, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle2 = shufflevector <512 x i8> %shuffle1, <512 x i8> %a1, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 512, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 513, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 514, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 
188, i32 189, i32 190, i32 191, i32 515, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 516, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 517, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 518, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 519, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <512 x i8> %shuffle2
+}
+
 define <64 x i8> @shuffle_v32i16_zextinreg_to_v16i32(<64 x i8> %a)  {
 ; ALL-LABEL: shuffle_v32i16_zextinreg_to_v16i32:
 ; ALL:       # %bb.0:
diff --git a/llvm/test/MC/AArch64/armv8.6a-fgt.s b/llvm/test/MC/AArch64/armv8.6a-fgt.s
index 11002ac..632a531b 100644
--- a/llvm/test/MC/AArch64/armv8.6a-fgt.s
+++ b/llvm/test/MC/AArch64/armv8.6a-fgt.s
@@ -1,75 +1,149 @@
-// RUN:     llvm-mc -triple aarch64 -show-encoding -mattr=+fgt   < %s | FileCheck %s
-// RUN:     llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a < %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64 -show-encoding  < %s 2>&1         | FileCheck %s --check-prefix=NOFGT
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v8.6a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+fgt < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+fgt < %s \
+// RUN:        | llvm-objdump -d --mattr=+fgt - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+fgt < %s \
+// RUN:   | llvm-objdump -d --mattr=-fgt - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+fgt < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+fgt -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 msr HFGRTR_EL2, x0
+// CHECK-INST: msr HFGRTR_EL2, x0
+// CHECK-ENCODING: encoding: [0x80,0x11,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c1180      msr S3_4_C1_C1_4, x0
+
 msr HFGWTR_EL2, x5
+// CHECK-INST: msr HFGWTR_EL2, x5
+// CHECK-ENCODING: encoding: [0xa5,0x11,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c11a5      msr S3_4_C1_C1_5, x5
+
 msr HFGITR_EL2, x10
+// CHECK-INST: msr HFGITR_EL2, x10
+// CHECK-ENCODING: encoding: [0xca,0x11,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c11ca      msr S3_4_C1_C1_6, x10
+
 msr HDFGRTR_EL2, x15
+// CHECK-INST: msr HDFGRTR_EL2, x15
+// CHECK-ENCODING: encoding: [0x8f,0x31,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c318f      msr S3_4_C3_C1_4, x15
+
 msr HDFGWTR_EL2, x20
+// CHECK-INST: msr HDFGWTR_EL2, x20
+// CHECK-ENCODING: encoding: [0xb4,0x31,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c31b4      msr S3_4_C3_C1_5, x20
+
 msr HAFGRTR_EL2, x25
-// CHECK: msr     HFGRTR_EL2, x0          // encoding: [0x80,0x11,0x1c,0xd5]
-// CHECK: msr     HFGWTR_EL2, x5          // encoding: [0xa5,0x11,0x1c,0xd5]
-// CHECK: msr     HFGITR_EL2, x10         // encoding: [0xca,0x11,0x1c,0xd5]
-// CHECK: msr     HDFGRTR_EL2, x15        // encoding: [0x8f,0x31,0x1c,0xd5]
-// CHECK: msr     HDFGWTR_EL2, x20        // encoding: [0xb4,0x31,0x1c,0xd5]
-// CHECK: msr     HAFGRTR_EL2, x25        // encoding: [0xd9,0x31,0x1c,0xd5]
-// NOFGT: error: expected writable system register or pstate
-// NOFGT: error: expected writable system register or pstate
-// NOFGT: error: expected writable system register or pstate
-// NOFGT: error: expected writable system register or pstate
-// NOFGT: error: expected writable system register or pstate
-// NOFGT: error: expected writable system register or pstate
+// CHECK-INST: msr HAFGRTR_EL2, x25
+// CHECK-ENCODING: encoding: [0xd9,0x31,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c31d9      msr S3_4_C3_C1_6, x25
 
 mrs x30,  HFGRTR_EL2
+// CHECK-INST: mrs x30, HFGRTR_EL2
+// CHECK-ENCODING: encoding: [0x9e,0x11,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:11: error: expected readable system register
+// CHECK-UNKNOWN:  d53c119e      mrs x30, S3_4_C1_C1_4
+
 mrs x25,  HFGWTR_EL2
+// CHECK-INST: mrs x25, HFGWTR_EL2
+// CHECK-ENCODING: encoding: [0xb9,0x11,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:11: error: expected readable system register
+// CHECK-UNKNOWN:  d53c11b9      mrs x25, S3_4_C1_C1_5
+
 mrs x20,  HFGITR_EL2
+// CHECK-INST: mrs x20, HFGITR_EL2
+// CHECK-ENCODING: encoding: [0xd4,0x11,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:11: error: expected readable system register
+// CHECK-UNKNOWN:  d53c11d4      mrs x20, S3_4_C1_C1_6
+
 mrs x15,  HDFGRTR_EL2
+// CHECK-INST: mrs x15, HDFGRTR_EL2
+// CHECK-ENCODING: encoding: [0x8f,0x31,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:11: error: expected readable system register
+// CHECK-UNKNOWN:  d53c318f      mrs x15, S3_4_C3_C1_4
+
 mrs x10,  HDFGWTR_EL2
-mrs x5,   HAFGRTR_EL2
-// CHECK: mrs     x30, HFGRTR_EL2         // encoding: [0x9e,0x11,0x3c,0xd5]
-// CHECK: mrs     x25, HFGWTR_EL2         // encoding: [0xb9,0x11,0x3c,0xd5]
-// CHECK: mrs     x20, HFGITR_EL2         // encoding: [0xd4,0x11,0x3c,0xd5]
-// CHECK: mrs     x15, HDFGRTR_EL2        // encoding: [0x8f,0x31,0x3c,0xd5]
-// CHECK: mrs     x10, HDFGWTR_EL2        // encoding: [0xaa,0x31,0x3c,0xd5]
-// CHECK: mrs     x5, HAFGRTR_EL2         // encoding: [0xc5,0x31,0x3c,0xd5]
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
+// CHECK-INST: mrs x10, HDFGWTR_EL2
+// CHECK-ENCODING: encoding: [0xaa,0x31,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:11: error: expected readable system register
+// CHECK-UNKNOWN:  d53c31aa      mrs x10, S3_4_C3_C1_5
 
+mrs x5,   HAFGRTR_EL2
+// CHECK-INST: mrs x5, HAFGRTR_EL2
+// CHECK-ENCODING: encoding: [0xc5,0x31,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:11: error: expected readable system register
+// CHECK-UNKNOWN:  d53c31c5      mrs x5, S3_4_C3_C1_6
 
 mrs x3, HDFGRTR2_EL2
+// CHECK-INST: mrs x3, HDFGRTR2_EL2
+// CHECK-ENCODING: encoding: [0x03,0x31,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:9: error: expected readable system register
+// CHECK-UNKNOWN:  d53c3103      mrs x3, S3_4_C3_C1_0
+
 mrs x3, HDFGWTR2_EL2
+// CHECK-INST: mrs x3, HDFGWTR2_EL2
+// CHECK-ENCODING: encoding: [0x23,0x31,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:9: error: expected readable system register
+// CHECK-UNKNOWN:  d53c3123      mrs x3, S3_4_C3_C1_1
+
 mrs x3, HFGRTR2_EL2
+// CHECK-INST: mrs x3, HFGRTR2_EL2
+// CHECK-ENCODING: encoding: [0x43,0x31,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:9: error: expected readable system register
+// CHECK-UNKNOWN:  d53c3143      mrs x3, S3_4_C3_C1_2
+
 mrs x3, HFGWTR2_EL2
-mrs x3, HFGITR2_EL2
-// CHECK: mrs     x3, HDFGRTR2_EL2                // encoding: [0x03,0x31,0x3c,0xd5]
-// CHECK: mrs     x3, HDFGWTR2_EL2                // encoding: [0x23,0x31,0x3c,0xd5]
-// CHECK: mrs     x3, HFGRTR2_EL2                 // encoding: [0x43,0x31,0x3c,0xd5]
-// CHECK: mrs     x3, HFGWTR2_EL2                 // encoding: [0x63,0x31,0x3c,0xd5]
-// CHECK: mrs     x3, HFGITR2_EL2                 // encoding: [0xe3,0x31,0x3c,0xd5]
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
-// NOFGT: error: expected readable system register
+// CHECK-INST: mrs x3, HFGWTR2_EL2
+// CHECK-ENCODING: encoding: [0x63,0x31,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:9: error: expected readable system register
+// CHECK-UNKNOWN:  d53c3163      mrs x3, S3_4_C3_C1_3
 
+mrs x3, HFGITR2_EL2
+// CHECK-INST: mrs x3, HFGITR2_EL2
+// CHECK-ENCODING: encoding: [0xe3,0x31,0x3c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:9: error: expected readable system register
+// CHECK-UNKNOWN:  d53c31e3      mrs x3, S3_4_C3_C1_7
 
 msr HDFGRTR2_EL2, x3
+// CHECK-INST: msr HDFGRTR2_EL2, x3
+// CHECK-ENCODING: encoding: [0x03,0x31,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c3103      msr S3_4_C3_C1_0, x3
+
 msr HDFGWTR2_EL2, x3
+// CHECK-INST: msr HDFGWTR2_EL2, x3
+// CHECK-ENCODING: encoding: [0x23,0x31,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c3123      msr S3_4_C3_C1_1, x3
+
 msr HFGRTR2_EL2, x3
+// CHECK-INST: msr HFGRTR2_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x31,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c3143      msr S3_4_C3_C1_2, x3
+
 msr HFGWTR2_EL2, x3
+// CHECK-INST: msr HFGWTR2_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x31,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c3163      msr S3_4_C3_C1_3, x3
+
 msr HFGITR2_EL2, x3
-// CHECK: msr     HDFGRTR2_EL2, x3                // encoding: [0x03,0x31,0x1c,0xd5]
-// CHECK: msr     HDFGWTR2_EL2, x3                // encoding: [0x23,0x31,0x1c,0xd5]
-// CHECK: msr     HFGRTR2_EL2, x3                 // encoding: [0x43,0x31,0x1c,0xd5]
-// CHECK: msr     HFGWTR2_EL2, x3                 // encoding: [0x63,0x31,0x1c,0xd5]
-// CHECK: msr     HFGITR2_EL2, x3                 // encoding: [0xe3,0x31,0x1c,0xd5]
-// NOFGT: error: expected writable system register
-// NOFGT: error: expected writable system register
-// NOFGT: error: expected writable system register
-// NOFGT: error: expected writable system register
-// NOFGT: error: expected writable system register
+// CHECK-INST: msr HFGITR2_EL2, x3
+// CHECK-ENCODING: encoding: [0xe3,0x31,0x1c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:5: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c31e3      msr S3_4_C3_C1_7, x3
diff --git a/llvm/test/MC/AArch64/armv8.8a-mops-diagnostics.s b/llvm/test/MC/AArch64/armv8.8a-mops-diagnostics.s
new file mode 100644
index 0000000..a9a8612
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv8.8a-mops-diagnostics.s
@@ -0,0 +1,227 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+mops,+mte < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.8a,+mte < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+
+// All operands must be different from each other
+
+// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
+cpyfp [x0]!, [x0]!, x1!
+cpyfp [x0]!, [x1]!, x0!
+cpyfp [x1]!, [x0]!, x0!
+
+// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
+cpyfm [x0]!, [x0]!, x1!
+cpyfm [x0]!, [x1]!, x0!
+cpyfm [x1]!, [x0]!, x0!
+
+// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
+cpyfe [x0]!, [x0]!, x1!
+cpyfe [x0]!, [x1]!, x0!
+cpyfe [x1]!, [x0]!, x0!
+
+// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
+cpyp [x0]!, [x0]!, x1!
+cpyp [x0]!, [x1]!, x0!
+cpyp [x1]!, [x0]!, x0!
+
+// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
+cpym [x0]!, [x0]!, x1!
+cpym [x0]!, [x1]!, x0!
+cpym [x1]!, [x0]!, x0!
+
+// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
+cpye [x0]!, [x0]!, x1!
+cpye [x0]!, [x1]!, x0!
+cpye [x1]!, [x0]!, x0!
+
+// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
+setp [x0]!, x0!, x1
+setp [x0]!, x1!, x0
+setp [x1]!, x0!, x0
+
+// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
+setm [x0]!, x0!, x1
+setm [x0]!, x1!, x0
+setm [x1]!, x0!, x0
+
+// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
+sete [x0]!, x0!, x1
+sete [x0]!, x1!, x0
+sete [x1]!, x0!, x0
+
+// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
+setgp [x0]!, x0!, x1
+setgp [x0]!, x1!, x0
+setgp [x1]!, x0!, x0
+
+// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
+setgm [x0]!, x0!, x1
+setgm [x0]!, x1!, x0
+setgm [x1]!, x0!, x0
+
+// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
+// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
+// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
+setge [x0]!, x0!, x1
+setge [x0]!, x1!, x0
+setge [x1]!, x0!, x0
+
+// SP cannot be used as an argument at any position
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpyfp [sp]!, [x1]!, x2!
+cpyfp [x0]!, [sp]!, x2!
+cpyfp [x0]!, [x1]!, sp!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpyfm [sp]!, [x1]!, x2!
+cpyfm [x0]!, [sp]!, x2!
+cpyfm [x0]!, [x1]!, sp!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpyfe [sp]!, [x1]!, x2!
+cpyfe [x0]!, [sp]!, x2!
+cpyfe [x0]!, [x1]!, sp!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpyp [sp]!, [x2]!, x2!
+cpyp [x0]!, [sp]!, x2!
+cpyp [x0]!, [x1]!, sp!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpym [sp]!, [x2]!, x2!
+cpym [x0]!, [sp]!, x2!
+cpym [x0]!, [x1]!, sp!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpye [sp]!, [x2]!, x2!
+cpye [x0]!, [sp]!, x2!
+cpye [x0]!, [x1]!, sp!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+setp [sp]!, x1!, x2
+setp [x0]!, sp!, x2
+setp [x0]!, x1!, sp
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+setm [sp]!, x1!, x2
+setm [x0]!, sp!, x2
+setm [x0]!, x1!, sp
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+sete [sp]!, x1!, x2
+sete [x0]!, sp!, x2
+sete [x0]!, x1!, sp
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+setgp [sp]!, x1!, x2
+setgp [x0]!, sp!, x2
+setgp [x0]!, x1!, sp
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+setgm [sp]!, x1!, x2
+setgm [x0]!, sp!, x2
+setgm [x0]!, x1!, sp
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+setge [sp]!, x1!, x2
+setge [x0]!, sp!, x2
+setge [x0]!, x1!, sp
+
+// XZR can only be used at:
+//  - the size operand in CPY.
+//  - the size or source operands in SET.
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpyfp [xzr]!, [x1]!, x2!
+cpyfp [x0]!, [xzr]!, x2!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpyfm [xzr]!, [x1]!, x2!
+cpyfm [x0]!, [xzr]!, x2!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpyfe [xzr]!, [x1]!, x2!
+cpyfe [x0]!, [xzr]!, x2!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpyp [xzr]!, [x2]!, x2!
+cpyp [x0]!, [xzr]!, x2!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpym [xzr]!, [x2]!, x2!
+cpym [x0]!, [xzr]!, x2!
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: invalid operand for instruction
+cpye [xzr]!, [x2]!, x2!
+cpye [x0]!, [xzr]!, x2!
+
+// CHECK-ERROR: error: invalid operand for instruction
+setp [xzr]!, x1!, x2
+
+// CHECK-ERROR: error: invalid operand for instruction
+setm [xzr]!, x1!, x2
+
+// CHECK-ERROR: error: invalid operand for instruction
+sete [xzr]!, x1!, x2
+
+// CHECK-ERROR: error: invalid operand for instruction
+setgp [xzr]!, x1!, x2
+
+// CHECK-ERROR: error: invalid operand for instruction
+setgm [xzr]!, x1!, x2
+
+// CHECK-ERROR: error: invalid operand for instruction
+setge [xzr]!, x1!, x2
diff --git a/llvm/test/MC/AArch64/armv8.8a-mops.s b/llvm/test/MC/AArch64/armv8.8a-mops.s
index f8d75e7..10a551d 100644
--- a/llvm/test/MC/AArch64/armv8.8a-mops.s
+++ b/llvm/test/MC/AArch64/armv8.8a-mops.s
@@ -1,654 +1,849 @@
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+mops,+mte < %s 2> %t | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MTE
-// RUN: FileCheck --check-prefix=CHECK-ERROR %s < %t
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.8a,+mte < %s 2> %t | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MTE
-// RUN: FileCheck --check-prefix=CHECK-ERROR %s < %t
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+mops < %s 2> %t | FileCheck %s --check-prefix=CHECK
-// RUN: FileCheck --check-prefix=CHECK-NO-MTE-ERR %s < %t
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.8a < %s 2> %t | FileCheck %s --check-prefix=CHECK
-// RUN: FileCheck --check-prefix=CHECK-NO-MTE-ERR %s < %t
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2> %t
-// RUN: FileCheck --check-prefix=CHECK-NO-MOPS-ERR --check-prefix=CHECK-NO-MOPSMTE-ERR %s < %t
-
-// CHECK:      [0x40,0x04,0x01,0x19]
-// CHECK-NEXT: [0x40,0x44,0x01,0x19]
-// CHECK-NEXT: [0x40,0x84,0x01,0x19]
-// CHECK-NEXT: [0x40,0xc4,0x01,0x19]
-// CHECK-NEXT: [0x40,0x14,0x01,0x19]
-// CHECK-NEXT: [0x40,0x54,0x01,0x19]
-// CHECK-NEXT: [0x40,0x94,0x01,0x19]
-// CHECK-NEXT: [0x40,0xd4,0x01,0x19]
-// CHECK-NEXT: [0x40,0x24,0x01,0x19]
-// CHECK-NEXT: [0x40,0x64,0x01,0x19]
-// CHECK-NEXT: [0x40,0xa4,0x01,0x19]
-// CHECK-NEXT: [0x40,0xe4,0x01,0x19]
-// CHECK-NEXT: [0x40,0x34,0x01,0x19]
-// CHECK-NEXT: [0x40,0x74,0x01,0x19]
-// CHECK-NEXT: [0x40,0xb4,0x01,0x19]
-// CHECK-NEXT: [0x40,0xf4,0x01,0x19]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mops,+mte < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v8.8a,+mte < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mops,+mte < %s \
+// RUN:        | llvm-objdump -d --mattr=+mops,+mte - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mops,+mte < %s \
+// RUN:   | llvm-objdump -d --mattr=-mops,-mte - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mops,+mte < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+mops,+mte -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
+
 cpyfp [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfp [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x04,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19010440      <unknown>
+
 cpyfpwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfpwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x44,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19014440      <unknown>
+
 cpyfprn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfprn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x84,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19018440      <unknown>
+
 cpyfpn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfpn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xc4,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1901c440      <unknown>
+
 cpyfpwt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfpwt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x14,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19011440      <unknown>
+
 cpyfpwtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfpwtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x54,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19015440      <unknown>
+
 cpyfpwtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfpwtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x94,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19019440      <unknown>
+
 cpyfpwtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfpwtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xd4,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1901d440      <unknown>
+
 cpyfprt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfprt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x24,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19012440      <unknown>
+
 cpyfprtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfprtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x64,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19016440      <unknown>
+
 cpyfprtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfprtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xa4,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1901a440      <unknown>
+
 cpyfprtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfprtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xe4,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1901e440      <unknown>
+
 cpyfpt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfpt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x34,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19013440      <unknown>
+
 cpyfptwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfptwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x74,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19017440      <unknown>
+
 cpyfptrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfptrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xb4,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1901b440      <unknown>
+
 cpyfptn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfptn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xf4,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1901f440      <unknown>
 
-// CHECK:      [0x40,0x04,0x41,0x19]
-// CHECK-NEXT: [0x40,0x44,0x41,0x19]
-// CHECK-NEXT: [0x40,0x84,0x41,0x19]
-// CHECK-NEXT: [0x40,0xc4,0x41,0x19]
-// CHECK-NEXT: [0x40,0x14,0x41,0x19]
-// CHECK-NEXT: [0x40,0x54,0x41,0x19]
-// CHECK-NEXT: [0x40,0x94,0x41,0x19]
-// CHECK-NEXT: [0x40,0xd4,0x41,0x19]
-// CHECK-NEXT: [0x40,0x24,0x41,0x19]
-// CHECK-NEXT: [0x40,0x64,0x41,0x19]
-// CHECK-NEXT: [0x40,0xa4,0x41,0x19]
-// CHECK-NEXT: [0x40,0xe4,0x41,0x19]
-// CHECK-NEXT: [0x40,0x34,0x41,0x19]
-// CHECK-NEXT: [0x40,0x74,0x41,0x19]
-// CHECK-NEXT: [0x40,0xb4,0x41,0x19]
-// CHECK-NEXT: [0x40,0xf4,0x41,0x19]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
 cpyfm [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfm [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x04,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19410440      <unknown>
+
 cpyfmwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x44,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19414440      <unknown>
+
 cpyfmrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x84,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19418440      <unknown>
+
 cpyfmn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xc4,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1941c440      <unknown>
+
 cpyfmwt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmwt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x14,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19411440      <unknown>
+
 cpyfmwtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmwtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x54,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19415440      <unknown>
+
 cpyfmwtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmwtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x94,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19419440      <unknown>
+
 cpyfmwtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmwtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xd4,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1941d440      <unknown>
+
 cpyfmrt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmrt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x24,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19412440      <unknown>
+
 cpyfmrtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmrtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x64,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19416440      <unknown>
+
 cpyfmrtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmrtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xa4,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1941a440      <unknown>
+
 cpyfmrtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmrtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xe4,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1941e440      <unknown>
+
 cpyfmt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x34,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19413440      <unknown>
+
 cpyfmtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x74,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19417440      <unknown>
+
 cpyfmtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xb4,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1941b440      <unknown>
+
 cpyfmtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfmtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xf4,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1941f440      <unknown>
 
-// CHECK:      [0x40,0x04,0x81,0x19]
-// CHECK-NEXT: [0x40,0x44,0x81,0x19]
-// CHECK-NEXT: [0x40,0x84,0x81,0x19]
-// CHECK-NEXT: [0x40,0xc4,0x81,0x19]
-// CHECK-NEXT: [0x40,0x14,0x81,0x19]
-// CHECK-NEXT: [0x40,0x54,0x81,0x19]
-// CHECK-NEXT: [0x40,0x94,0x81,0x19]
-// CHECK-NEXT: [0x40,0xd4,0x81,0x19]
-// CHECK-NEXT: [0x40,0x24,0x81,0x19]
-// CHECK-NEXT: [0x40,0x64,0x81,0x19]
-// CHECK-NEXT: [0x40,0xa4,0x81,0x19]
-// CHECK-NEXT: [0x40,0xe4,0x81,0x19]
-// CHECK-NEXT: [0x40,0x34,0x81,0x19]
-// CHECK-NEXT: [0x40,0x74,0x81,0x19]
-// CHECK-NEXT: [0x40,0xb4,0x81,0x19]
-// CHECK-NEXT: [0x40,0xf4,0x81,0x19]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
 cpyfe [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfe [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x04,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19810440      <unknown>
+
 cpyfewn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfewn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x44,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19814440      <unknown>
+
 cpyfern [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfern [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x84,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19818440      <unknown>
+
 cpyfen [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfen [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xc4,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1981c440      <unknown>
+
 cpyfewt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfewt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x14,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19811440      <unknown>
+
 cpyfewtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfewtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x54,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19815440      <unknown>
+
 cpyfewtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfewtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x94,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19819440      <unknown>
+
 cpyfewtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfewtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xd4,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1981d440      <unknown>
+
 cpyfert [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfert [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x24,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19812440      <unknown>
+
 cpyfertwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfertwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x64,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19816440      <unknown>
+
 cpyfertrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfertrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xa4,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1981a440      <unknown>
+
 cpyfertn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfertn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xe4,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1981e440      <unknown>
+
 cpyfet [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfet [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x34,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19813440      <unknown>
+
 cpyfetwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfetwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x74,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19817440      <unknown>
+
 cpyfetrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfetrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xb4,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1981b440      <unknown>
+
 cpyfetn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyfetn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xf4,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1981f440      <unknown>
 
-// CHECK:      [0x40,0x04,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x44,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x84,0x01,0x1d]
-// CHECK-NEXT: [0x40,0xc4,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x14,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x54,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x94,0x01,0x1d]
-// CHECK-NEXT: [0x40,0xd4,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x24,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x64,0x01,0x1d]
-// CHECK-NEXT: [0x40,0xa4,0x01,0x1d]
-// CHECK-NEXT: [0x40,0xe4,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x34,0x01,0x1d]
-// CHECK-NEXT: [0x40,0x74,0x01,0x1d]
-// CHECK-NEXT: [0x40,0xb4,0x01,0x1d]
-// CHECK-NEXT: [0x40,0xf4,0x01,0x1d]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
 cpyp [x0]!, [x1]!, x2!
+// CHECK-INST: cpyp [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x04,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d010440      <unknown>
+
 cpypwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpypwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x44,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d014440      <unknown>
+
 cpyprn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyprn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x84,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d018440      <unknown>
+
 cpypn [x0]!, [x1]!, x2!
+// CHECK-INST: cpypn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xc4,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d01c440      <unknown>
+
 cpypwt [x0]!, [x1]!, x2!
+// CHECK-INST: cpypwt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x14,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d011440      <unknown>
+
 cpypwtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpypwtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x54,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d015440      <unknown>
+
 cpypwtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpypwtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x94,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d019440      <unknown>
+
 cpypwtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpypwtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xd4,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d01d440      <unknown>
+
 cpyprt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyprt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x24,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d012440      <unknown>
+
 cpyprtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyprtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x64,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d016440      <unknown>
+
 cpyprtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyprtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xa4,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d01a440      <unknown>
+
 cpyprtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyprtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xe4,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d01e440      <unknown>
+
 cpypt [x0]!, [x1]!, x2!
+// CHECK-INST: cpypt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x34,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d013440      <unknown>
+
 cpyptwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyptwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x74,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d017440      <unknown>
+
 cpyptrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyptrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xb4,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d01b440      <unknown>
+
 cpyptn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyptn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xf4,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d01f440      <unknown>
 
-// CHECK:      [0x40,0x04,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x44,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x84,0x41,0x1d]
-// CHECK-NEXT: [0x40,0xc4,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x14,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x54,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x94,0x41,0x1d]
-// CHECK-NEXT: [0x40,0xd4,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x24,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x64,0x41,0x1d]
-// CHECK-NEXT: [0x40,0xa4,0x41,0x1d]
-// CHECK-NEXT: [0x40,0xe4,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x34,0x41,0x1d]
-// CHECK-NEXT: [0x40,0x74,0x41,0x1d]
-// CHECK-NEXT: [0x40,0xb4,0x41,0x1d]
-// CHECK-NEXT: [0x40,0xf4,0x41,0x1d]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
 cpym [x0]!, [x1]!, x2!
+// CHECK-INST: cpym [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x04,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d410440      <unknown>
+
 cpymwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x44,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d414440      <unknown>
+
 cpymrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x84,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d418440      <unknown>
+
 cpymn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xc4,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d41c440      <unknown>
+
 cpymwt [x0]!, [x1]!, x2!
+// CHECK-INST: cpymwt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x14,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d411440      <unknown>
+
 cpymwtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymwtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x54,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d415440      <unknown>
+
 cpymwtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymwtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x94,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d419440      <unknown>
+
 cpymwtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymwtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xd4,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d41d440      <unknown>
+
 cpymrt [x0]!, [x1]!, x2!
+// CHECK-INST: cpymrt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x24,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d412440      <unknown>
+
 cpymrtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymrtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x64,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d416440      <unknown>
+
 cpymrtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymrtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xa4,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d41a440      <unknown>
+
 cpymrtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymrtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xe4,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d41e440      <unknown>
+
 cpymt [x0]!, [x1]!, x2!
+// CHECK-INST: cpymt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x34,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d413440      <unknown>
+
 cpymtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x74,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d417440      <unknown>
+
 cpymtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xb4,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d41b440      <unknown>
+
 cpymtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpymtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xf4,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d41f440      <unknown>
 
-// CHECK:      [0x40,0x04,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x44,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x84,0x81,0x1d]
-// CHECK-NEXT: [0x40,0xc4,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x14,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x54,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x94,0x81,0x1d]
-// CHECK-NEXT: [0x40,0xd4,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x24,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x64,0x81,0x1d]
-// CHECK-NEXT: [0x40,0xa4,0x81,0x1d]
-// CHECK-NEXT: [0x40,0xe4,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x34,0x81,0x1d]
-// CHECK-NEXT: [0x40,0x74,0x81,0x1d]
-// CHECK-NEXT: [0x40,0xb4,0x81,0x1d]
-// CHECK-NEXT: [0x40,0xf4,0x81,0x1d]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
 cpye [x0]!, [x1]!, x2!
+// CHECK-INST: cpye [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x04,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d810440      <unknown>
+
 cpyewn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyewn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x44,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d814440      <unknown>
+
 cpyern [x0]!, [x1]!, x2!
+// CHECK-INST: cpyern [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x84,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d818440      <unknown>
+
 cpyen [x0]!, [x1]!, x2!
+// CHECK-INST: cpyen [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xc4,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d81c440      <unknown>
+
 cpyewt [x0]!, [x1]!, x2!
+// CHECK-INST: cpyewt [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x14,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d811440      <unknown>
+
 cpyewtwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyewtwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x54,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d815440      <unknown>
+
 cpyewtrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyewtrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x94,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d819440      <unknown>
+
 cpyewtn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyewtn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xd4,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d81d440      <unknown>
+
 cpyert [x0]!, [x1]!, x2!
+// CHECK-INST: cpyert [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x24,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d812440      <unknown>
+
 cpyertwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyertwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x64,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d816440      <unknown>
+
 cpyertrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyertrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xa4,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d81a440      <unknown>
+
 cpyertn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyertn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xe4,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d81e440      <unknown>
+
 cpyet [x0]!, [x1]!, x2!
+// CHECK-INST: cpyet [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x34,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d813440      <unknown>
+
 cpyetwn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyetwn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0x74,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d817440      <unknown>
+
 cpyetrn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyetrn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xb4,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d81b440      <unknown>
+
 cpyetn [x0]!, [x1]!, x2!
+// CHECK-INST: cpyetn [x0]!, [x1]!, x2!
+// CHECK-ENCODING: encoding: [0x40,0xf4,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d81f440      <unknown>
 
-// CHECK:      [0x20,0x04,0xc2,0x19]
-// CHECK-NEXT: [0x20,0x14,0xc2,0x19]
-// CHECK-NEXT: [0x20,0x24,0xc2,0x19]
-// CHECK-NEXT: [0x20,0x34,0xc2,0x19]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
 setp [x0]!, x1!, x2
+// CHECK-INST: setp [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x04,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c20420      <unknown>
+
 setpt [x0]!, x1!, x2
+// CHECK-INST: setpt [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x14,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c21420      <unknown>
+
 setpn [x0]!, x1!, x2
+// CHECK-INST: setpn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x24,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c22420      <unknown>
+
 setptn [x0]!, x1!, x2
+// CHECK-INST: setptn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x34,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c23420      <unknown>
 
-// CHECK: [0x20,0x44,0xc2,0x19]
-// CHECK: [0x20,0x54,0xc2,0x19]
-// CHECK: [0x20,0x64,0xc2,0x19]
-// CHECK: [0x20,0x74,0xc2,0x19]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
 setm [x0]!, x1!, x2
+// CHECK-INST: setm [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x44,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c24420      <unknown>
+
 setmt [x0]!, x1!, x2
+// CHECK-INST: setmt [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x54,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c25420      <unknown>
+
 setmn [x0]!, x1!, x2
+// CHECK-INST: setmn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x64,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c26420      <unknown>
+
 setmtn [x0]!, x1!, x2
+// CHECK-INST: setmtn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x74,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c27420      <unknown>
 
-// CHECK: [0x20,0x84,0xc2,0x19]
-// CHECK: [0x20,0x94,0xc2,0x19]
-// CHECK: [0x20,0xa4,0xc2,0x19]
-// CHECK: [0x20,0xb4,0xc2,0x19]
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
-// CHECK-NO-MOPS-ERR: error: instruction requires: mops
 sete [x0]!, x1!, x2
+// CHECK-INST: sete [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x84,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c28420      <unknown>
+
 setet [x0]!, x1!, x2
+// CHECK-INST: setet [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x94,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c29420      <unknown>
+
 seten [x0]!, x1!, x2
+// CHECK-INST: seten [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0xa4,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c2a420      <unknown>
+
 setetn [x0]!, x1!, x2
+// CHECK-INST: setetn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0xb4,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c2b420      <unknown>
 
-// CHECK-MTE: [0x20,0x04,0xc2,0x1d]
-// CHECK-MTE: [0x20,0x14,0xc2,0x1d]
-// CHECK-MTE: [0x20,0x24,0xc2,0x1d]
-// CHECK-MTE: [0x20,0x34,0xc2,0x1d]
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
 setgp [x0]!, x1!, x2
+// CHECK-INST: setgp [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x04,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc20420      <unknown>
+
 setgpt [x0]!, x1!, x2
+// CHECK-INST: setgpt [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x14,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc21420      <unknown>
+
 setgpn [x0]!, x1!, x2
+// CHECK-INST: setgpn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x24,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc22420      <unknown>
+
 setgptn [x0]!, x1!, x2
+// CHECK-INST: setgptn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x34,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc23420      <unknown>
 
-// CHECK-MTE: [0x20,0x44,0xc2,0x1d]
-// CHECK-MTE: [0x20,0x54,0xc2,0x1d]
-// CHECK-MTE: [0x20,0x64,0xc2,0x1d]
-// CHECK-MTE: [0x20,0x74,0xc2,0x1d]
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
 setgm [x0]!, x1!, x2
+// CHECK-INST: setgm [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x44,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc24420      <unknown>
+
 setgmt [x0]!, x1!, x2
+// CHECK-INST: setgmt [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x54,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc25420      <unknown>
+
 setgmn [x0]!, x1!, x2
+// CHECK-INST: setgmn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x64,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc26420      <unknown>
+
 setgmtn [x0]!, x1!, x2
+// CHECK-INST: setgmtn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x74,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc27420      <unknown>
 
-// CHECK-MTE: [0x20,0x84,0xc2,0x1d]
-// CHECK-MTE: [0x20,0x94,0xc2,0x1d]
-// CHECK-MTE: [0x20,0xa4,0xc2,0x1d]
-// CHECK-MTE: [0x20,0xb4,0xc2,0x1d]
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MTE-ERR: error: instruction requires: mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
-// CHECK-NO-MOPSMTE-ERR: error: instruction requires: mops mte
 setge [x0]!, x1!, x2
+// CHECK-INST: setge [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x84,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc28420      <unknown>
+
 setget [x0]!, x1!, x2
+// CHECK-INST: setget [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0x94,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc29420      <unknown>
+
 setgen [x0]!, x1!, x2
-setgetn [x0]!, x1!, x2
+// CHECK-INST: setgen [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0xa4,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc2a420      <unknown>
 
-// All operand must be different from each other
-
-// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
-cpyfp [x0]!, [x0]!, x1!
-cpyfp [x0]!, [x1]!, x0!
-cpyfp [x1]!, [x0]!, x0!
-
-// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
-cpyfm [x0]!, [x0]!, x1!
-cpyfm [x0]!, [x1]!, x0!
-cpyfm [x1]!, [x0]!, x0!
-
-// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
-cpyfe [x0]!, [x0]!, x1!
-cpyfe [x0]!, [x1]!, x0!
-cpyfe [x1]!, [x0]!, x0!
-
-// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
-cpyp [x0]!, [x0]!, x1!
-cpyp [x0]!, [x1]!, x0!
-cpyp [x1]!, [x0]!, x0!
-
-// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
-cpym [x0]!, [x0]!, x1!
-cpym [x0]!, [x1]!, x0!
-cpym [x1]!, [x0]!, x0!
-
-// CHECK-ERROR: error: invalid CPY instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid CPY instruction, source and size registers are the same
-cpye [x0]!, [x0]!, x1!
-cpye [x0]!, [x1]!, x0!
-cpye [x1]!, [x0]!, x0!
-
-// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
-setp [x0]!, x0!, x1
-setp [x0]!, x1!, x0
-setp [x1]!, x0!, x0
-
-// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
-setm [x0]!, x0!, x1
-setm [x0]!, x1!, x0
-setm [x1]!, x0!, x0
-
-// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
-sete [x0]!, x0!, x1
-sete [x0]!, x1!, x0
-sete [x1]!, x0!, x0
-
-// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
-setgp [x0]!, x0!, x1
-setgp [x0]!, x1!, x0
-setgp [x1]!, x0!, x0
-
-// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
-setgm [x0]!, x0!, x1
-setgm [x0]!, x1!, x0
-setgm [x1]!, x0!, x0
-
-// CHECK-ERROR: error: invalid SET instruction, destination and size registers are the same
-// CHECK-ERROR: error: invalid SET instruction, destination and source registers are the same
-// CHECK-ERROR: error: invalid SET instruction, source and size registers are the same
-setge [x0]!, x0!, x1
-setge [x0]!, x1!, x0
-setge [x1]!, x0!, x0
-
-// SP cannot be used as argument at any position
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-cpyfp [sp]!, [x1]!, x2!
-cpyfp [x0]!, [sp]!, x2!
-cpyfp [x0]!, [x1]!, sp!
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-cpyfm [sp]!, [x1]!, x2!
-cpyfm [x0]!, [sp]!, x2!
-cpyfm [x0]!, [x1]!, sp!
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-cpyfe [sp]!, [x1]!, x2!
-cpyfe [x0]!, [sp]!, x2!
-cpyfe [x0]!, [x1]!, sp!
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-cpyp [sp]!, [x2]!, x2!
-cpyp [x0]!, [sp]!, x2!
-cpyp [x0]!, [x1]!, sp!
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-cpym [sp]!, [x2]!, x2!
-cpym [x0]!, [sp]!, x2!
-cpym [x0]!, [x1]!, sp!
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-cpye [sp]!, [x2]!, x2!
-cpye [x0]!, [sp]!, x2!
-cpye [x0]!, [x1]!, sp!
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-setp [sp]!, x1!, x2
-setp [x0]!, sp!, x2
-setp [x0]!, x1!, sp
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-setm [sp]!, x1!, x2
-setm [x0]!, sp!, x2
-setm [x0]!, x1!, sp
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-sete [sp]!, x1!, x2
-sete [x0]!, sp!, x2
-sete [x0]!, x1!, sp
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-setgp [sp]!, x1!, x2
-setgp [x0]!, sp!, x2
-setgp [x0]!, x1!, sp
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-setgm [sp]!, x1!, x2
-setgm [x0]!, sp!, x2
-setgm [x0]!, x1!, sp
-
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-setge [sp]!, x1!, x2
-setge [x0]!, sp!, x2
-setge [x0]!, x1!, sp
+setgetn [x0]!, x1!, x2
+// CHECK-INST: setgetn [x0]!, x1!, x2
+// CHECK-ENCODING: encoding: [0x20,0xb4,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops mte
+// CHECK-UNKNOWN:  1dc2b420      <unknown>
 
 // XZR can only be used at:
 //  - the size operand in CPY.
 //  - the size or source operands in SET.
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       cpyfp [x0]!, [x1]!, xzr!
-cpyfp [xzr]!, [x1]!, x2!
-cpyfp [x0]!, [xzr]!, x2!
 cpyfp [x0]!, [x1]!, xzr!
+// CHECK-INST: cpyfp [x0]!, [x1]!, xzr!
+// CHECK-ENCODING: encoding: [0xe0,0x07,0x01,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  190107e0      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       cpyfm [x0]!, [x1]!, xzr!
-cpyfm [xzr]!, [x1]!, x2!
-cpyfm [x0]!, [xzr]!, x2!
 cpyfm [x0]!, [x1]!, xzr!
+// CHECK-INST: cpyfm [x0]!, [x1]!, xzr!
+// CHECK-ENCODING: encoding: [0xe0,0x07,0x41,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  194107e0      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       cpyfe [x0]!, [x1]!, xzr!
-cpyfe [xzr]!, [x1]!, x2!
-cpyfe [x0]!, [xzr]!, x2!
 cpyfe [x0]!, [x1]!, xzr!
+// CHECK-INST: cpyfe [x0]!, [x1]!, xzr!
+// CHECK-ENCODING: encoding: [0xe0,0x07,0x81,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  198107e0      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       cpyp [x0]!, [x1]!, xzr!
-cpyp [xzr]!, [x2]!, x2!
-cpyp [x0]!, [xzr]!, x2!
 cpyp [x0]!, [x1]!, xzr!
+// CHECK-INST: cpyp [x0]!, [x1]!, xzr!
+// CHECK-ENCODING: encoding: [0xe0,0x07,0x01,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d0107e0      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       cpym [x0]!, [x1]!, xzr!
-cpym [xzr]!, [x2]!, x2!
-cpym [x0]!, [xzr]!, x2!
 cpym [x0]!, [x1]!, xzr!
+// CHECK-INST: cpym [x0]!, [x1]!, xzr!
+// CHECK-ENCODING: encoding: [0xe0,0x07,0x41,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d4107e0      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       cpye [x0]!, [x1]!, xzr!
-cpye [xzr]!, [x2]!, x2!
-cpye [x0]!, [xzr]!, x2!
 cpye [x0]!, [x1]!, xzr!
+// CHECK-INST: cpye [x0]!, [x1]!, xzr!
+// CHECK-ENCODING: encoding: [0xe0,0x07,0x81,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1d8107e0      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       setp [x0]!, xzr!, x2
-// CHECK:       setp [x0]!, x1!, xzr
-setp [xzr]!, x1!, x2
 setp [x0]!, xzr!, x2
+// CHECK-INST: setp [x0]!, xzr!, x2
+// CHECK-ENCODING: encoding: [0xe0,0x07,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c207e0      <unknown>
+
 setp [x0]!, x1!, xzr
+// CHECK-INST: setp [x0]!, x1!, xzr
+// CHECK-ENCODING: encoding: [0x20,0x04,0xdf,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19df0420      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       setm [x0]!, xzr!, x2
-// CHECK:       setm [x0]!, x1!, xzr
-setm [xzr]!, x1!, x2
 setm [x0]!, xzr!, x2
+// CHECK-INST: setm [x0]!, xzr!, x2
+// CHECK-ENCODING: encoding: [0xe0,0x47,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c247e0      <unknown>
+
 setm [x0]!, x1!, xzr
+// CHECK-INST: setm [x0]!, x1!, xzr
+// CHECK-ENCODING: encoding: [0x20,0x44,0xdf,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19df4420      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK:       sete [x0]!, xzr!, x2
-// CHECK:       sete [x0]!, x1!, xzr
-sete [xzr]!, x1!, x2
 sete [x0]!, xzr!, x2
+// CHECK-INST: sete [x0]!, xzr!, x2
+// CHECK-ENCODING: encoding: [0xe0,0x87,0xc2,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19c287e0      <unknown>
+
 sete [x0]!, x1!, xzr
+// CHECK-INST: sete [x0]!, x1!, xzr
+// CHECK-ENCODING: encoding: [0x20,0x84,0xdf,0x19]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  19df8420      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-MTE:   setgp [x0]!, xzr!, x2
-// CHECK-MTE:   setgp [x0]!, x1!, xzr
-setgp [xzr]!, x1!, x2
 setgp [x0]!, xzr!, x2
+// CHECK-INST: setgp [x0]!, xzr!, x2
+// CHECK-ENCODING: encoding: [0xe0,0x07,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1dc207e0      <unknown>
+
 setgp [x0]!, x1!, xzr
+// CHECK-INST: setgp [x0]!, x1!, xzr
+// CHECK-ENCODING: encoding: [0x20,0x04,0xdf,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1ddf0420      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-MTE:   setgm [x0]!, xzr!, x2
-// CHECK-MTE:   setgm [x0]!, x1!, xzr
-setgm [xzr]!, x1!, x2
 setgm [x0]!, xzr!, x2
+// CHECK-INST: setgm [x0]!, xzr!, x2
+// CHECK-ENCODING: encoding: [0xe0,0x47,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1dc247e0      <unknown>
+
 setgm [x0]!, x1!, xzr
+// CHECK-INST: setgm [x0]!, x1!, xzr
+// CHECK-ENCODING: encoding: [0x20,0x44,0xdf,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1ddf4420      <unknown>
 
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-MTE:   setge [x0]!, xzr!, x2
-// CHECK-MTE:   setge [x0]!, x1!, xzr
-setge [xzr]!, x1!, x2
 setge [x0]!, xzr!, x2
+// CHECK-INST: setge [x0]!, xzr!, x2
+// CHECK-ENCODING: encoding: [0xe0,0x87,0xc2,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1dc287e0      <unknown>
+
 setge [x0]!, x1!, xzr
+// CHECK-INST: setge [x0]!, x1!, xzr
+// CHECK-ENCODING: encoding: [0x20,0x84,0xdf,0x1d]
+// CHECK-ERROR: error: instruction requires: mops
+// CHECK-UNKNOWN:  1ddf8420      <unknown>
diff --git a/llvm/test/MC/AArch64/armv8.9a-ats1a.s b/llvm/test/MC/AArch64/armv8.9a-ats1a.s
index a30d206..21b960e 100644
--- a/llvm/test/MC/AArch64/armv8.9a-ats1a.s
+++ b/llvm/test/MC/AArch64/armv8.9a-ats1a.s
@@ -1,10 +1,26 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefixes=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 at s1e1a, x1
-// CHECK: at s1e1a, x1                        // encoding: [0x41,0x79,0x08,0xd5]
+// CHECK-INST: at s1e1a, x1
+// CHECK-ENCODING: encoding: [0x41,0x79,0x08,0xd5]
+// CHECK-UNKNOWN:  d5087941 at s1e1a, x1
 
 at s1e2a, x1
-// CHECK: at s1e2a, x1                        // encoding: [0x41,0x79,0x0c,0xd5]
+// CHECK-INST: at s1e2a, x1
+// CHECK-ENCODING: encoding: [0x41,0x79,0x0c,0xd5]
+// CHECK-UNKNOWN:  d50c7941 at s1e2a, x1
 
 at s1e3a, x1
-// CHECK: at s1e3a, x1                        // encoding: [0x41,0x79,0x0e,0xd5]
+// CHECK-INST: at s1e3a, x1
+// CHECK-ENCODING: encoding: [0x41,0x79,0x0e,0xd5]
+// CHECK-UNKNOWN:  d50e7941 at s1e3a, x1
diff --git a/llvm/test/MC/AArch64/armv8.9a-clrbhb.s b/llvm/test/MC/AArch64/armv8.9a-clrbhb.s
index 96de61f..9b5bfec 100644
--- a/llvm/test/MC/AArch64/armv8.9a-clrbhb.s
+++ b/llvm/test/MC/AArch64/armv8.9a-clrbhb.s
@@ -2,42 +2,56 @@
 // Assembly is always permitted for instructions in the hint space.
 
 // Optional, off by default
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8a < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8.8a < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9a < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9.3a < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8a < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8.8a < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9a < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9.3a < %s | FileCheck %s --check-prefix=HINT_22
 
 // Optional, off by default, doubly disabled
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8.8a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9.3a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8.8a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9.3a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
 
 // Optional, off by default, manually enabled
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+clrbhb < %s | FileCheck %s --check-prefix=CLRBHB
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8a,+clrbhb < %s | FileCheck %s --check-prefix=CLRBHB
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8.8a,+clrbhb < %s | FileCheck %s --check-prefix=CLRBHB
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9a,+clrbhb < %s | FileCheck %s --check-prefix=CLRBHB
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9.3a,+clrbhb < %s | FileCheck %s --check-prefix=CLRBHB
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+clrbhb < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8a,+clrbhb < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8.8a,+clrbhb < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9a,+clrbhb < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9.3a,+clrbhb < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 // Mandatory, enabled by default
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8.9a < %s | FileCheck %s --check-prefix=CLRBHB
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9.4a < %s | FileCheck %s --check-prefix=CLRBHB
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8.9a < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9.4a < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 // Mandatory, on by default, doubly enabled
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8.9a,+clrbhb < %s | FileCheck %s --check-prefix=CLRBHB
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9.4a,+clrbhb < %s | FileCheck %s --check-prefix=CLRBHB
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8.9a,+clrbhb < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9.4a,+clrbhb < %s | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 // Mandatory, can't prevent disabling in LLVM
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v8.9a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
-// RUN: llvm-mc -show-encoding -triple aarch64-none-elf -mattr=+v9.4a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
-
-        clrbhb
-        hint #22
-
-// CLRBHB: clrbhb    // encoding: [0xdf,0x22,0x03,0xd5]
-// CLRBHB: clrbhb    // encoding: [0xdf,0x22,0x03,0xd5]
-// HINT_22: hint #22 // encoding: [0xdf,0x22,0x03,0xd5]
-// HINT_22: hint #22 // encoding: [0xdf,0x22,0x03,0xd5]
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v8.9a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
+// RUN: llvm-mc -show-encoding -triple aarch64 -mattr=+v9.4a,-clrbhb < %s | FileCheck %s --check-prefix=HINT_22
+
+// Check Unknown
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+clrbhb < %s \
+// RUN:   | llvm-objdump -d --mattr=-clrbhb --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+clrbhb < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+clrbhb -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+clrbhb
+// HINT_22: hint #22                             // encoding: [0xdf,0x22,0x03,0xd5]
+// CHECK-INST: clrbhb
+// CHECK-ENCODING: encoding: [0xdf,0x22,0x03,0xd5]
+// CHECK-UNKNOWN:  d50322df    hint #22
+
+hint #22
+// HINT_22: hint #22                             // encoding: [0xdf,0x22,0x03,0xd5]
+// CHECK-INST: clrbhb
+// CHECK-ENCODING: encoding: [0xdf,0x22,0x03,0xd5]
+// CHECK-UNKNOWN:  d50322df    hint #22
diff --git a/llvm/test/MC/AArch64/armv8.9a-debug-pmu.s b/llvm/test/MC/AArch64/armv8.9a-debug-pmu.s
index 0b74905..db5cb07 100644
--- a/llvm/test/MC/AArch64/armv8.9a-debug-pmu.s
+++ b/llvm/test/MC/AArch64/armv8.9a-debug-pmu.s
@@ -1,485 +1,1785 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding               -mattr=+ite < %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.8a -mattr=+ite < %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.9a -mattr=+ite < %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v9.3a -mattr=+ite < %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v9.4a -mattr=+ite < %s | FileCheck %s
-
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding               < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-ITE %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.8a < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-ITE %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.9a < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-ITE %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v9.3a < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-ITE %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v9.4a < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-ITE %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v8.8a,+ite < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v8.9a,+ite < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v9.3a,+ite < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v9.4a,+ite < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ite < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+ite < %s \
+// RUN:        | llvm-objdump -d --mattr=+ite --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+ite < %s \
+// RUN:   | llvm-objdump -d --mattr=-ite --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ite < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+ite -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
+mrs x3, DBGBVR0_EL1
+// CHECK-INST: mrs x3, DBGBVR0_EL1
+// CHECK-ENCODING: encoding: [0x83,0x00,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300083 mrs x3, DBGBVR0_EL1
+
+msr DBGBVR0_EL1, x1
+// CHECK-INST: msr DBGBVR0_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x00,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100081 msr DBGBVR0_EL1, x1
+
+mrs x3, DBGBVR1_EL1
+// CHECK-INST: mrs x3, DBGBVR1_EL1
+// CHECK-ENCODING: encoding: [0x83,0x01,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300183 mrs x3, DBGBVR1_EL1
+
+msr DBGBVR1_EL1, x1
+// CHECK-INST: msr DBGBVR1_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x01,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100181 msr DBGBVR1_EL1, x1
+
+mrs x3, DBGBVR2_EL1
+// CHECK-INST: mrs x3, DBGBVR2_EL1
+// CHECK-ENCODING: encoding: [0x83,0x02,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300283 mrs x3, DBGBVR2_EL1
+
+msr DBGBVR2_EL1, x1
+// CHECK-INST: msr DBGBVR2_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x02,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100281 msr DBGBVR2_EL1, x1
+
+mrs x3, DBGBVR3_EL1
+// CHECK-INST: mrs x3, DBGBVR3_EL1
+// CHECK-ENCODING: encoding: [0x83,0x03,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300383 mrs x3, DBGBVR3_EL1
+
+msr DBGBVR3_EL1, x1
+// CHECK-INST: msr DBGBVR3_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x03,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100381 msr DBGBVR3_EL1, x1
+
+mrs x3, DBGBVR4_EL1
+// CHECK-INST: mrs x3, DBGBVR4_EL1
+// CHECK-ENCODING: encoding: [0x83,0x04,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300483 mrs x3, DBGBVR4_EL1
+
+msr DBGBVR4_EL1, x1
+// CHECK-INST: msr DBGBVR4_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x04,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100481 msr DBGBVR4_EL1, x1
+
+mrs x3, DBGBVR5_EL1
+// CHECK-INST: mrs x3, DBGBVR5_EL1
+// CHECK-ENCODING: encoding: [0x83,0x05,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300583 mrs x3, DBGBVR5_EL1
+
+msr DBGBVR5_EL1, x1
+// CHECK-INST: msr DBGBVR5_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x05,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100581 msr DBGBVR5_EL1, x1
+
+mrs x3, DBGBVR6_EL1
+// CHECK-INST: mrs x3, DBGBVR6_EL1
+// CHECK-ENCODING: encoding: [0x83,0x06,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300683 mrs x3, DBGBVR6_EL1
+
+msr DBGBVR6_EL1, x1
+// CHECK-INST: msr DBGBVR6_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x06,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100681 msr DBGBVR6_EL1, x1
+
+mrs x3, DBGBVR7_EL1
+// CHECK-INST: mrs x3, DBGBVR7_EL1
+// CHECK-ENCODING: encoding: [0x83,0x07,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300783 mrs x3, DBGBVR7_EL1
+
+msr DBGBVR7_EL1, x1
+// CHECK-INST: msr DBGBVR7_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x07,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100781 msr DBGBVR7_EL1, x1
+
+mrs x3, DBGBVR8_EL1
+// CHECK-INST: mrs x3, DBGBVR8_EL1
+// CHECK-ENCODING: encoding: [0x83,0x08,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300883 mrs x3, DBGBVR8_EL1
+
+msr DBGBVR8_EL1, x1
+// CHECK-INST: msr DBGBVR8_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x08,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100881 msr DBGBVR8_EL1, x1
+
+mrs x3, DBGBVR9_EL1
+// CHECK-INST: mrs x3, DBGBVR9_EL1
+// CHECK-ENCODING: encoding: [0x83,0x09,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300983 mrs x3, DBGBVR9_EL1
+
+msr DBGBVR9_EL1, x1
+// CHECK-INST: msr DBGBVR9_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x09,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100981 msr DBGBVR9_EL1, x1
+
+mrs x3, DBGBVR10_EL1
+// CHECK-INST: mrs x3, DBGBVR10_EL1
+// CHECK-ENCODING: encoding: [0x83,0x0a,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300a83 mrs x3, DBGBVR10_EL1
+
+msr DBGBVR10_EL1, x1
+// CHECK-INST: msr DBGBVR10_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x0a,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100a81 msr DBGBVR10_EL1, x1
+
+mrs x3, DBGBVR11_EL1
+// CHECK-INST: mrs x3, DBGBVR11_EL1
+// CHECK-ENCODING: encoding: [0x83,0x0b,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300b83 mrs x3, DBGBVR11_EL1
+
+msr DBGBVR11_EL1, x1
+// CHECK-INST: msr DBGBVR11_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x0b,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100b81 msr DBGBVR11_EL1, x1
+
+mrs x3, DBGBVR12_EL1
+// CHECK-INST: mrs x3, DBGBVR12_EL1
+// CHECK-ENCODING: encoding: [0x83,0x0c,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300c83 mrs x3, DBGBVR12_EL1
+
+msr DBGBVR12_EL1, x1
+// CHECK-INST: msr DBGBVR12_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x0c,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100c81 msr DBGBVR12_EL1, x1
+
+mrs x3, DBGBVR13_EL1
+// CHECK-INST: mrs x3, DBGBVR13_EL1
+// CHECK-ENCODING: encoding: [0x83,0x0d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300d83 mrs x3, DBGBVR13_EL1
+
+msr DBGBVR13_EL1, x1
+// CHECK-INST: msr DBGBVR13_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x0d,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100d81 msr DBGBVR13_EL1, x1
+
+mrs x3, DBGBVR14_EL1
+// CHECK-INST: mrs x3, DBGBVR14_EL1
+// CHECK-ENCODING: encoding: [0x83,0x0e,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300e83 mrs x3, DBGBVR14_EL1
+
+msr DBGBVR14_EL1, x1
+// CHECK-INST: msr DBGBVR14_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x0e,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100e81 msr DBGBVR14_EL1, x1
+
+mrs x3, DBGBVR15_EL1
+// CHECK-INST: mrs x3, DBGBVR15_EL1
+// CHECK-ENCODING: encoding: [0x83,0x0f,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300f83 mrs x3, DBGBVR15_EL1
+
+msr DBGBVR15_EL1, x1
+// CHECK-INST: msr DBGBVR15_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x0f,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100f81 msr DBGBVR15_EL1, x1
+
+mrs x3, DBGBCR0_EL1
+// CHECK-INST: mrs x3, DBGBCR0_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x00,0x30,0xd5]
+// CHECK-UNKNOWN:  d53000a3 mrs x3, DBGBCR0_EL1
+
+msr DBGBCR0_EL1, x1
+// CHECK-INST: msr DBGBCR0_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x00,0x10,0xd5]
+// CHECK-UNKNOWN:  d51000a1 msr DBGBCR0_EL1, x1
+
+mrs x3, DBGBCR1_EL1
+// CHECK-INST: mrs x3, DBGBCR1_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x01,0x30,0xd5]
+// CHECK-UNKNOWN:  d53001a3 mrs x3, DBGBCR1_EL1
+
+msr DBGBCR1_EL1, x1
+// CHECK-INST: msr DBGBCR1_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x01,0x10,0xd5]
+// CHECK-UNKNOWN:  d51001a1 msr DBGBCR1_EL1, x1
+
+mrs x3, DBGBCR2_EL1
+// CHECK-INST: mrs x3, DBGBCR2_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x02,0x30,0xd5]
+// CHECK-UNKNOWN:  d53002a3 mrs x3, DBGBCR2_EL1
+
+msr DBGBCR2_EL1, x1
+// CHECK-INST: msr DBGBCR2_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x02,0x10,0xd5]
+// CHECK-UNKNOWN:  d51002a1 msr DBGBCR2_EL1, x1
+
+mrs x3, DBGBCR3_EL1
+// CHECK-INST: mrs x3, DBGBCR3_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x03,0x30,0xd5]
+// CHECK-UNKNOWN:  d53003a3 mrs x3, DBGBCR3_EL1
+
+msr DBGBCR3_EL1, x1
+// CHECK-INST: msr DBGBCR3_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x03,0x10,0xd5]
+// CHECK-UNKNOWN:  d51003a1 msr DBGBCR3_EL1, x1
+
+mrs x3, DBGBCR4_EL1
+// CHECK-INST: mrs x3, DBGBCR4_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x04,0x30,0xd5]
+// CHECK-UNKNOWN:  d53004a3 mrs x3, DBGBCR4_EL1
+
+msr DBGBCR4_EL1, x1
+// CHECK-INST: msr DBGBCR4_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x04,0x10,0xd5]
+// CHECK-UNKNOWN:  d51004a1 msr DBGBCR4_EL1, x1
+
+mrs x3, DBGBCR5_EL1
+// CHECK-INST: mrs x3, DBGBCR5_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x05,0x30,0xd5]
+// CHECK-UNKNOWN:  d53005a3 mrs x3, DBGBCR5_EL1
+
+msr DBGBCR5_EL1, x1
+// CHECK-INST: msr DBGBCR5_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x05,0x10,0xd5]
+// CHECK-UNKNOWN:  d51005a1 msr DBGBCR5_EL1, x1
+
+mrs x3, DBGBCR6_EL1
+// CHECK-INST: mrs x3, DBGBCR6_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x06,0x30,0xd5]
+// CHECK-UNKNOWN:  d53006a3 mrs x3, DBGBCR6_EL1
+
+msr DBGBCR6_EL1, x1
+// CHECK-INST: msr DBGBCR6_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x06,0x10,0xd5]
+// CHECK-UNKNOWN:  d51006a1 msr DBGBCR6_EL1, x1
+
+mrs x3, DBGBCR7_EL1
+// CHECK-INST: mrs x3, DBGBCR7_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x07,0x30,0xd5]
+// CHECK-UNKNOWN:  d53007a3 mrs x3, DBGBCR7_EL1
+
+msr DBGBCR7_EL1, x1
+// CHECK-INST: msr DBGBCR7_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x07,0x10,0xd5]
+// CHECK-UNKNOWN:  d51007a1 msr DBGBCR7_EL1, x1
+
+mrs x3, DBGBCR8_EL1
+// CHECK-INST: mrs x3, DBGBCR8_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x08,0x30,0xd5]
+// CHECK-UNKNOWN:  d53008a3 mrs x3, DBGBCR8_EL1
+
+msr DBGBCR8_EL1, x1
+// CHECK-INST: msr DBGBCR8_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x08,0x10,0xd5]
+// CHECK-UNKNOWN:  d51008a1 msr DBGBCR8_EL1, x1
+
+mrs x3, DBGBCR9_EL1
+// CHECK-INST: mrs x3, DBGBCR9_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x09,0x30,0xd5]
+// CHECK-UNKNOWN:  d53009a3 mrs x3, DBGBCR9_EL1
+
+msr DBGBCR9_EL1, x1
+// CHECK-INST: msr DBGBCR9_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x09,0x10,0xd5]
+// CHECK-UNKNOWN:  d51009a1 msr DBGBCR9_EL1, x1
+
+mrs x3, DBGBCR10_EL1
+// CHECK-INST: mrs x3, DBGBCR10_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x0a,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300aa3 mrs x3, DBGBCR10_EL1
+
+msr DBGBCR10_EL1, x1
+// CHECK-INST: msr DBGBCR10_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x0a,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100aa1 msr DBGBCR10_EL1, x1
+
+mrs x3, DBGBCR11_EL1
+// CHECK-INST: mrs x3, DBGBCR11_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x0b,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300ba3 mrs x3, DBGBCR11_EL1
+
+msr DBGBCR11_EL1, x1
+// CHECK-INST: msr DBGBCR11_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x0b,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100ba1 msr DBGBCR11_EL1, x1
+
+mrs x3, DBGBCR12_EL1
+// CHECK-INST: mrs x3, DBGBCR12_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x0c,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300ca3 mrs x3, DBGBCR12_EL1
+
+msr DBGBCR12_EL1, x1
+// CHECK-INST: msr DBGBCR12_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x0c,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100ca1 msr DBGBCR12_EL1, x1
+
+mrs x3, DBGBCR13_EL1
+// CHECK-INST: mrs x3, DBGBCR13_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x0d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300da3 mrs x3, DBGBCR13_EL1
+
+msr DBGBCR13_EL1, x1
+// CHECK-INST: msr DBGBCR13_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x0d,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100da1 msr DBGBCR13_EL1, x1
+
+mrs x3, DBGBCR14_EL1
+// CHECK-INST: mrs x3, DBGBCR14_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x0e,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300ea3 mrs x3, DBGBCR14_EL1
+
+msr DBGBCR14_EL1, x1
+// CHECK-INST: msr DBGBCR14_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x0e,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100ea1 msr DBGBCR14_EL1, x1
+
+mrs x3, DBGBCR15_EL1
+// CHECK-INST: mrs x3, DBGBCR15_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x0f,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300fa3 mrs x3, DBGBCR15_EL1
+
+msr DBGBCR15_EL1, x1
+// CHECK-INST: msr DBGBCR15_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x0f,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100fa1 msr DBGBCR15_EL1, x1
+
+mrs x3, DBGWVR0_EL1
+// CHECK-INST: mrs x3, DBGWVR0_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x00,0x30,0xd5]
+// CHECK-UNKNOWN:  d53000c3 mrs x3, DBGWVR0_EL1
+
+msr DBGWVR0_EL1, x1
+// CHECK-INST: msr DBGWVR0_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x00,0x10,0xd5]
+// CHECK-UNKNOWN:  d51000c1 msr DBGWVR0_EL1, x1
+
+mrs x3, DBGWVR1_EL1
+// CHECK-INST: mrs x3, DBGWVR1_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x01,0x30,0xd5]
+// CHECK-UNKNOWN:  d53001c3 mrs x3, DBGWVR1_EL1
+
+msr DBGWVR1_EL1, x1
+// CHECK-INST: msr DBGWVR1_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x01,0x10,0xd5]
+// CHECK-UNKNOWN:  d51001c1 msr DBGWVR1_EL1, x1
+
+mrs x3, DBGWVR2_EL1
+// CHECK-INST: mrs x3, DBGWVR2_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x02,0x30,0xd5]
+// CHECK-UNKNOWN:  d53002c3 mrs x3, DBGWVR2_EL1
+
+msr DBGWVR2_EL1, x1
+// CHECK-INST: msr DBGWVR2_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x02,0x10,0xd5]
+// CHECK-UNKNOWN:  d51002c1 msr DBGWVR2_EL1, x1
+
+mrs x3, DBGWVR3_EL1
+// CHECK-INST: mrs x3, DBGWVR3_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x03,0x30,0xd5]
+// CHECK-UNKNOWN:  d53003c3 mrs x3, DBGWVR3_EL1
+
+msr DBGWVR3_EL1, x1
+// CHECK-INST: msr DBGWVR3_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x03,0x10,0xd5]
+// CHECK-UNKNOWN:  d51003c1 msr DBGWVR3_EL1, x1
+
+mrs x3, DBGWVR4_EL1
+// CHECK-INST: mrs x3, DBGWVR4_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x04,0x30,0xd5]
+// CHECK-UNKNOWN:  d53004c3 mrs x3, DBGWVR4_EL1
+
+msr DBGWVR4_EL1, x1
+// CHECK-INST: msr DBGWVR4_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x04,0x10,0xd5]
+// CHECK-UNKNOWN:  d51004c1 msr DBGWVR4_EL1, x1
+
+mrs x3, DBGWVR5_EL1
+// CHECK-INST: mrs x3, DBGWVR5_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x05,0x30,0xd5]
+// CHECK-UNKNOWN:  d53005c3 mrs x3, DBGWVR5_EL1
+
+msr DBGWVR5_EL1, x1
+// CHECK-INST: msr DBGWVR5_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x05,0x10,0xd5]
+// CHECK-UNKNOWN:  d51005c1 msr DBGWVR5_EL1, x1
+
+mrs x3, DBGWVR6_EL1
+// CHECK-INST: mrs x3, DBGWVR6_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x06,0x30,0xd5]
+// CHECK-UNKNOWN:  d53006c3 mrs x3, DBGWVR6_EL1
+
+msr DBGWVR6_EL1, x1
+// CHECK-INST: msr DBGWVR6_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x06,0x10,0xd5]
+// CHECK-UNKNOWN:  d51006c1 msr DBGWVR6_EL1, x1
+
+mrs x3, DBGWVR7_EL1
+// CHECK-INST: mrs x3, DBGWVR7_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x07,0x30,0xd5]
+// CHECK-UNKNOWN:  d53007c3 mrs x3, DBGWVR7_EL1
+
+msr DBGWVR7_EL1, x1
+// CHECK-INST: msr DBGWVR7_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x07,0x10,0xd5]
+// CHECK-UNKNOWN:  d51007c1 msr DBGWVR7_EL1, x1
+
+mrs x3, DBGWVR8_EL1
+// CHECK-INST: mrs x3, DBGWVR8_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x08,0x30,0xd5]
+// CHECK-UNKNOWN:  d53008c3 mrs x3, DBGWVR8_EL1
+
+msr DBGWVR8_EL1, x1
+// CHECK-INST: msr DBGWVR8_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x08,0x10,0xd5]
+// CHECK-UNKNOWN:  d51008c1 msr DBGWVR8_EL1, x1
+
+mrs x3, DBGWVR9_EL1
+// CHECK-INST: mrs x3, DBGWVR9_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x09,0x30,0xd5]
+// CHECK-UNKNOWN:  d53009c3 mrs x3, DBGWVR9_EL1
+
+msr DBGWVR9_EL1, x1
+// CHECK-INST: msr DBGWVR9_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x09,0x10,0xd5]
+// CHECK-UNKNOWN:  d51009c1 msr DBGWVR9_EL1, x1
+
+mrs x3, DBGWVR10_EL1
+// CHECK-INST: mrs x3, DBGWVR10_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x0a,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300ac3 mrs x3, DBGWVR10_EL1
+
+msr DBGWVR10_EL1, x1
+// CHECK-INST: msr DBGWVR10_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x0a,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100ac1 msr DBGWVR10_EL1, x1
+
+mrs x3, DBGWVR11_EL1
+// CHECK-INST: mrs x3, DBGWVR11_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x0b,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300bc3 mrs x3, DBGWVR11_EL1
+
+msr DBGWVR11_EL1, x1
+// CHECK-INST: msr DBGWVR11_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x0b,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100bc1 msr DBGWVR11_EL1, x1
+
+mrs x3, DBGWVR12_EL1
+// CHECK-INST: mrs x3, DBGWVR12_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x0c,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300cc3 mrs x3, DBGWVR12_EL1
+
+msr DBGWVR12_EL1, x1
+// CHECK-INST: msr DBGWVR12_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x0c,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100cc1 msr DBGWVR12_EL1, x1
+
+mrs x3, DBGWVR13_EL1
+// CHECK-INST: mrs x3, DBGWVR13_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x0d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300dc3 mrs x3, DBGWVR13_EL1
+
+msr DBGWVR13_EL1, x1
+// CHECK-INST: msr DBGWVR13_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x0d,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100dc1 msr DBGWVR13_EL1, x1
+
+mrs x3, DBGWVR14_EL1
+// CHECK-INST: mrs x3, DBGWVR14_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x0e,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300ec3 mrs x3, DBGWVR14_EL1
+
+msr DBGWVR14_EL1, x1
+// CHECK-INST: msr DBGWVR14_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x0e,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100ec1 msr DBGWVR14_EL1, x1
+
+mrs x3, DBGWVR15_EL1
+// CHECK-INST: mrs x3, DBGWVR15_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x0f,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300fc3 mrs x3, DBGWVR15_EL1
+
+msr DBGWVR15_EL1, x1
+// CHECK-INST: msr DBGWVR15_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0x0f,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100fc1 msr DBGWVR15_EL1, x1
+
+mrs x3, DBGWCR0_EL1
+// CHECK-INST: mrs x3, DBGWCR0_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x00,0x30,0xd5]
+// CHECK-UNKNOWN:  d53000e3 mrs x3, DBGWCR0_EL1
+
+msr DBGWCR0_EL1, x1
+// CHECK-INST: msr DBGWCR0_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x00,0x10,0xd5]
+// CHECK-UNKNOWN:  d51000e1 msr DBGWCR0_EL1, x1
+
+mrs x3, DBGWCR1_EL1
+// CHECK-INST: mrs x3, DBGWCR1_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x01,0x30,0xd5]
+// CHECK-UNKNOWN:  d53001e3 mrs x3, DBGWCR1_EL1
+
+msr DBGWCR1_EL1, x1
+// CHECK-INST: msr DBGWCR1_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x01,0x10,0xd5]
+// CHECK-UNKNOWN:  d51001e1 msr DBGWCR1_EL1, x1
+
+mrs x3, DBGWCR2_EL1
+// CHECK-INST: mrs x3, DBGWCR2_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x02,0x30,0xd5]
+// CHECK-UNKNOWN:  d53002e3 mrs x3, DBGWCR2_EL1
+
+msr DBGWCR2_EL1, x1
+// CHECK-INST: msr DBGWCR2_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x02,0x10,0xd5]
+// CHECK-UNKNOWN:  d51002e1 msr DBGWCR2_EL1, x1
+
+mrs x3, DBGWCR3_EL1
+// CHECK-INST: mrs x3, DBGWCR3_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x03,0x30,0xd5]
+// CHECK-UNKNOWN:  d53003e3 mrs x3, DBGWCR3_EL1
+
+msr DBGWCR3_EL1, x1
+// CHECK-INST: msr DBGWCR3_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x03,0x10,0xd5]
+// CHECK-UNKNOWN:  d51003e1 msr DBGWCR3_EL1, x1
+
+mrs x3, DBGWCR4_EL1
+// CHECK-INST: mrs x3, DBGWCR4_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x04,0x30,0xd5]
+// CHECK-UNKNOWN:  d53004e3 mrs x3, DBGWCR4_EL1
+
+msr DBGWCR4_EL1, x1
+// CHECK-INST: msr DBGWCR4_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x04,0x10,0xd5]
+// CHECK-UNKNOWN:  d51004e1 msr DBGWCR4_EL1, x1
+
+mrs x3, DBGWCR5_EL1
+// CHECK-INST: mrs x3, DBGWCR5_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x05,0x30,0xd5]
+// CHECK-UNKNOWN:  d53005e3 mrs x3, DBGWCR5_EL1
+
+msr DBGWCR5_EL1, x1
+// CHECK-INST: msr DBGWCR5_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x05,0x10,0xd5]
+// CHECK-UNKNOWN:  d51005e1 msr DBGWCR5_EL1, x1
+
+mrs x3, DBGWCR6_EL1
+// CHECK-INST: mrs x3, DBGWCR6_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x06,0x30,0xd5]
+// CHECK-UNKNOWN:  d53006e3 mrs x3, DBGWCR6_EL1
+
+msr DBGWCR6_EL1, x1
+// CHECK-INST: msr DBGWCR6_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x06,0x10,0xd5]
+// CHECK-UNKNOWN:  d51006e1 msr DBGWCR6_EL1, x1
+
+mrs x3, DBGWCR7_EL1
+// CHECK-INST: mrs x3, DBGWCR7_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x07,0x30,0xd5]
+// CHECK-UNKNOWN:  d53007e3 mrs x3, DBGWCR7_EL1
+
+msr DBGWCR7_EL1, x1
+// CHECK-INST: msr DBGWCR7_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x07,0x10,0xd5]
+// CHECK-UNKNOWN:  d51007e1 msr DBGWCR7_EL1, x1
+
+mrs x3, DBGWCR8_EL1
+// CHECK-INST: mrs x3, DBGWCR8_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x08,0x30,0xd5]
+// CHECK-UNKNOWN:  d53008e3 mrs x3, DBGWCR8_EL1
+
+msr DBGWCR8_EL1, x1
+// CHECK-INST: msr DBGWCR8_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x08,0x10,0xd5]
+// CHECK-UNKNOWN:  d51008e1 msr DBGWCR8_EL1, x1
+
+mrs x3, DBGWCR9_EL1
+// CHECK-INST: mrs x3, DBGWCR9_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x09,0x30,0xd5]
+// CHECK-UNKNOWN:  d53009e3 mrs x3, DBGWCR9_EL1
+
+msr DBGWCR9_EL1, x1
+// CHECK-INST: msr DBGWCR9_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x09,0x10,0xd5]
+// CHECK-UNKNOWN:  d51009e1 msr DBGWCR9_EL1, x1
+
+mrs x3, DBGWCR10_EL1
+// CHECK-INST: mrs x3, DBGWCR10_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x0a,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300ae3 mrs x3, DBGWCR10_EL1
+
+msr DBGWCR10_EL1, x1
+// CHECK-INST: msr DBGWCR10_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x0a,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100ae1 msr DBGWCR10_EL1, x1
+
+mrs x3, DBGWCR11_EL1
+// CHECK-INST: mrs x3, DBGWCR11_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x0b,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300be3 mrs x3, DBGWCR11_EL1
+
+msr DBGWCR11_EL1, x1
+// CHECK-INST: msr DBGWCR11_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x0b,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100be1 msr DBGWCR11_EL1, x1
+
+mrs x3, DBGWCR12_EL1
+// CHECK-INST: mrs x3, DBGWCR12_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x0c,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300ce3 mrs x3, DBGWCR12_EL1
+
+msr DBGWCR12_EL1, x1
+// CHECK-INST: msr DBGWCR12_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x0c,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100ce1 msr DBGWCR12_EL1, x1
+
+mrs x3, DBGWCR13_EL1
+// CHECK-INST: mrs x3, DBGWCR13_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x0d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300de3 mrs x3, DBGWCR13_EL1
+
+msr DBGWCR13_EL1, x1
+// CHECK-INST: msr DBGWCR13_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x0d,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100de1 msr DBGWCR13_EL1, x1
+
+mrs x3, DBGWCR14_EL1
+// CHECK-INST: mrs x3, DBGWCR14_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x0e,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300ee3 mrs x3, DBGWCR14_EL1
+
+msr DBGWCR14_EL1, x1
+// CHECK-INST: msr DBGWCR14_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x0e,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100ee1 msr DBGWCR14_EL1, x1
+
+mrs x3, DBGWCR15_EL1
+// CHECK-INST: mrs x3, DBGWCR15_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x0f,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300fe3 mrs x3, DBGWCR15_EL1
+
+msr DBGWCR15_EL1, x1
+// CHECK-INST: msr DBGWCR15_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x0f,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100fe1 msr DBGWCR15_EL1, x1
 
 // FEAT_DEBUGv8p9
-            mrs	x3, MDSELR_EL1
-// CHECK:   mrs	x3, MDSELR_EL1                  // encoding: [0x43,0x04,0x30,0xd5]
-            msr MDSELR_EL1, x1
-// CHECK:   msr	MDSELR_EL1, x1                  // encoding: [0x41,0x04,0x10,0xd5]
+mrs x3, MDSELR_EL1
+// CHECK-INST: mrs x3, MDSELR_EL1
+// CHECK-ENCODING: encoding: [0x43,0x04,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300443      mrs x3, MDSELR_EL1
+
+msr MDSELR_EL1, x1
+// CHECK-INST: msr MDSELR_EL1, x1
+// CHECK-ENCODING: encoding: [0x41,0x04,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100441      msr MDSELR_EL1, x1
 
 // FEAT_PMUv3p9
-            mrs	x3, PMUACR_EL1
-// CHECK:   mrs	x3, PMUACR_EL1                  // encoding: [0x83,0x9e,0x38,0xd5]
-            msr	PMUACR_EL1, x1
-// CHECK:   msr	PMUACR_EL1, x1                  // encoding: [0x81,0x9e,0x18,0xd5]
+mrs x3, PMUACR_EL1
+// CHECK-INST: mrs x3, PMUACR_EL1
+// CHECK-ENCODING: encoding: [0x83,0x9e,0x38,0xd5]
+// CHECK-UNKNOWN:  d5389e83      mrs x3, PMUACR_EL1
+
+msr PMUACR_EL1, x1
+// CHECK-INST: msr PMUACR_EL1, x1
+// CHECK-ENCODING: encoding: [0x81,0x9e,0x18,0xd5]
+// CHECK-UNKNOWN:  d5189e81      msr PMUACR_EL1, x1
 
 // FEAT_PMUv3_SS
-            mrs	x3, PMCCNTSVR_EL1
-// CHECK:   mrs	x3, PMCCNTSVR_EL1               // encoding: [0xe3,0xeb,0x30,0xd5]
-            mrs	x3, PMICNTSVR_EL1
-// CHECK:   mrs	x3, PMICNTSVR_EL1               // encoding: [0x03,0xec,0x30,0xd5]
-            mrs	x3, PMSSCR_EL1
-// CHECK:   mrs	x3, PMSSCR_EL1                  // encoding: [0x63,0x9d,0x38,0xd5]
-            msr	PMSSCR_EL1, x1
-// CHECK:   msr	PMSSCR_EL1, x1                  // encoding: [0x61,0x9d,0x18,0xd5]
-            mrs	x3, PMEVCNTSVR0_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR0_EL1             // encoding: [0x03,0xe8,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR1_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR1_EL1             // encoding: [0x23,0xe8,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR2_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR2_EL1             // encoding: [0x43,0xe8,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR3_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR3_EL1             // encoding: [0x63,0xe8,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR4_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR4_EL1             // encoding: [0x83,0xe8,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR5_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR5_EL1             // encoding: [0xa3,0xe8,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR6_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR6_EL1             // encoding: [0xc3,0xe8,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR7_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR7_EL1             // encoding: [0xe3,0xe8,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR8_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR8_EL1             // encoding: [0x03,0xe9,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR9_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR9_EL1             // encoding: [0x23,0xe9,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR10_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR10_EL1            // encoding: [0x43,0xe9,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR11_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR11_EL1            // encoding: [0x63,0xe9,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR12_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR12_EL1            // encoding: [0x83,0xe9,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR13_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR13_EL1            // encoding: [0xa3,0xe9,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR14_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR14_EL1            // encoding: [0xc3,0xe9,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR15_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR15_EL1            // encoding: [0xe3,0xe9,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR16_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR16_EL1            // encoding: [0x03,0xea,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR17_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR17_EL1            // encoding: [0x23,0xea,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR18_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR18_EL1            // encoding: [0x43,0xea,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR19_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR19_EL1            // encoding: [0x63,0xea,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR20_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR20_EL1            // encoding: [0x83,0xea,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR21_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR21_EL1            // encoding: [0xa3,0xea,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR22_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR22_EL1            // encoding: [0xc3,0xea,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR23_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR23_EL1            // encoding: [0xe3,0xea,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR24_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR24_EL1            // encoding: [0x03,0xeb,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR25_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR25_EL1            // encoding: [0x23,0xeb,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR26_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR26_EL1            // encoding: [0x43,0xeb,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR27_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR27_EL1            // encoding: [0x63,0xeb,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR28_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR28_EL1            // encoding: [0x83,0xeb,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR29_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR29_EL1            // encoding: [0xa3,0xeb,0x30,0xd5]
-            mrs	x3, PMEVCNTSVR30_EL1
-// CHECK:   mrs	x3, PMEVCNTSVR30_EL1            // encoding: [0xc3,0xeb,0x30,0xd5]
+mrs x3, PMCCNTSVR_EL1
+// CHECK-INST: mrs x3, PMCCNTSVR_EL1
+// CHECK-ENCODING: encoding: [0xe3,0xeb,0x30,0xd5]
+// CHECK-UNKNOWN:  d530ebe3      mrs x3, PMCCNTSVR_EL1
+
+mrs x3, PMICNTSVR_EL1
+// CHECK-INST: mrs x3, PMICNTSVR_EL1
+// CHECK-ENCODING: encoding: [0x03,0xec,0x30,0xd5]
+// CHECK-UNKNOWN:  d530ec03      mrs x3, PMICNTSVR_EL1
+
+mrs x3, PMSSCR_EL1
+// CHECK-INST: mrs x3, PMSSCR_EL1
+// CHECK-ENCODING: encoding: [0x63,0x9d,0x38,0xd5]
+// CHECK-UNKNOWN:  d5389d63      mrs x3, PMSSCR_EL1
+
+msr PMSSCR_EL1, x1
+// CHECK-INST: msr PMSSCR_EL1, x1
+// CHECK-ENCODING: encoding: [0x61,0x9d,0x18,0xd5]
+// CHECK-UNKNOWN:  d5189d61      msr PMSSCR_EL1, x1
+
+mrs x3, PMEVCNTSVR0_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR0_EL1
+// CHECK-ENCODING: encoding: [0x03,0xe8,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e803      mrs x3, PMEVCNTSVR0_EL1
+
+mrs x3, PMEVCNTSVR1_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR1_EL1
+// CHECK-ENCODING: encoding: [0x23,0xe8,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e823      mrs x3, PMEVCNTSVR1_EL1
+
+mrs x3, PMEVCNTSVR2_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR2_EL1
+// CHECK-ENCODING: encoding: [0x43,0xe8,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e843      mrs x3, PMEVCNTSVR2_EL1
+
+mrs x3, PMEVCNTSVR3_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR3_EL1
+// CHECK-ENCODING: encoding: [0x63,0xe8,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e863      mrs x3, PMEVCNTSVR3_EL1
+
+mrs x3, PMEVCNTSVR4_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR4_EL1
+// CHECK-ENCODING: encoding: [0x83,0xe8,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e883      mrs x3, PMEVCNTSVR4_EL1
+
+mrs x3, PMEVCNTSVR5_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR5_EL1
+// CHECK-ENCODING: encoding: [0xa3,0xe8,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e8a3      mrs x3, PMEVCNTSVR5_EL1
+
+mrs x3, PMEVCNTSVR6_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR6_EL1
+// CHECK-ENCODING: encoding: [0xc3,0xe8,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e8c3      mrs x3, PMEVCNTSVR6_EL1
+
+mrs x3, PMEVCNTSVR7_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR7_EL1
+// CHECK-ENCODING: encoding: [0xe3,0xe8,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e8e3      mrs x3, PMEVCNTSVR7_EL1
+
+mrs x3, PMEVCNTSVR8_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR8_EL1
+// CHECK-ENCODING: encoding: [0x03,0xe9,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e903      mrs x3, PMEVCNTSVR8_EL1
+
+mrs x3, PMEVCNTSVR9_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR9_EL1
+// CHECK-ENCODING: encoding: [0x23,0xe9,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e923      mrs x3, PMEVCNTSVR9_EL1
+
+mrs x3, PMEVCNTSVR10_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR10_EL1
+// CHECK-ENCODING: encoding: [0x43,0xe9,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e943      mrs x3, PMEVCNTSVR10_EL1
+
+mrs x3, PMEVCNTSVR11_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR11_EL1
+// CHECK-ENCODING: encoding: [0x63,0xe9,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e963      mrs x3, PMEVCNTSVR11_EL1
+
+mrs x3, PMEVCNTSVR12_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR12_EL1
+// CHECK-ENCODING: encoding: [0x83,0xe9,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e983      mrs x3, PMEVCNTSVR12_EL1
+
+mrs x3, PMEVCNTSVR13_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR13_EL1
+// CHECK-ENCODING: encoding: [0xa3,0xe9,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e9a3      mrs x3, PMEVCNTSVR13_EL1
+
+mrs x3, PMEVCNTSVR14_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR14_EL1
+// CHECK-ENCODING: encoding: [0xc3,0xe9,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e9c3      mrs x3, PMEVCNTSVR14_EL1
+
+mrs x3, PMEVCNTSVR15_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR15_EL1
+// CHECK-ENCODING: encoding: [0xe3,0xe9,0x30,0xd5]
+// CHECK-UNKNOWN:  d530e9e3      mrs x3, PMEVCNTSVR15_EL1
+
+mrs x3, PMEVCNTSVR16_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR16_EL1
+// CHECK-ENCODING: encoding: [0x03,0xea,0x30,0xd5]
+// CHECK-UNKNOWN:  d530ea03      mrs x3, PMEVCNTSVR16_EL1
+
+mrs x3, PMEVCNTSVR17_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR17_EL1
+// CHECK-ENCODING: encoding: [0x23,0xea,0x30,0xd5]
+// CHECK-UNKNOWN:  d530ea23      mrs x3, PMEVCNTSVR17_EL1
+
+mrs x3, PMEVCNTSVR18_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR18_EL1
+// CHECK-ENCODING: encoding: [0x43,0xea,0x30,0xd5]
+// CHECK-UNKNOWN:  d530ea43      mrs x3, PMEVCNTSVR18_EL1
+
+mrs x3, PMEVCNTSVR19_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR19_EL1
+// CHECK-ENCODING: encoding: [0x63,0xea,0x30,0xd5]
+// CHECK-UNKNOWN:  d530ea63      mrs x3, PMEVCNTSVR19_EL1
+
+mrs x3, PMEVCNTSVR20_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR20_EL1
+// CHECK-ENCODING: encoding: [0x83,0xea,0x30,0xd5]
+// CHECK-UNKNOWN:  d530ea83      mrs x3, PMEVCNTSVR20_EL1
+
+mrs x3, PMEVCNTSVR21_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR21_EL1
+// CHECK-ENCODING: encoding: [0xa3,0xea,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eaa3      mrs x3, PMEVCNTSVR21_EL1
+
+mrs x3, PMEVCNTSVR22_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR22_EL1
+// CHECK-ENCODING: encoding: [0xc3,0xea,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eac3      mrs x3, PMEVCNTSVR22_EL1
+
+mrs x3, PMEVCNTSVR23_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR23_EL1
+// CHECK-ENCODING: encoding: [0xe3,0xea,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eae3      mrs x3, PMEVCNTSVR23_EL1
+
+mrs x3, PMEVCNTSVR24_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR24_EL1
+// CHECK-ENCODING: encoding: [0x03,0xeb,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eb03      mrs x3, PMEVCNTSVR24_EL1
+
+mrs x3, PMEVCNTSVR25_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR25_EL1
+// CHECK-ENCODING: encoding: [0x23,0xeb,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eb23      mrs x3, PMEVCNTSVR25_EL1
+
+mrs x3, PMEVCNTSVR26_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR26_EL1
+// CHECK-ENCODING: encoding: [0x43,0xeb,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eb43      mrs x3, PMEVCNTSVR26_EL1
+
+mrs x3, PMEVCNTSVR27_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR27_EL1
+// CHECK-ENCODING: encoding: [0x63,0xeb,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eb63      mrs x3, PMEVCNTSVR27_EL1
+
+mrs x3, PMEVCNTSVR28_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR28_EL1
+// CHECK-ENCODING: encoding: [0x83,0xeb,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eb83      mrs x3, PMEVCNTSVR28_EL1
+
+mrs x3, PMEVCNTSVR29_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR29_EL1
+// CHECK-ENCODING: encoding: [0xa3,0xeb,0x30,0xd5]
+// CHECK-UNKNOWN:  d530eba3      mrs x3, PMEVCNTSVR29_EL1
+
+mrs x3, PMEVCNTSVR30_EL1
+// CHECK-INST: mrs x3, PMEVCNTSVR30_EL1
+// CHECK-ENCODING: encoding: [0xc3,0xeb,0x30,0xd5]
+// CHECK-UNKNOWN:  d530ebc3      mrs x3, PMEVCNTSVR30_EL1
 
 // FEAT_PMUv3_ICNTR
-            mrs x3, PMICNTR_EL0
-// CHECK:   mrs x3, PMICNTR_EL0                 // encoding: [0x03,0x94,0x3b,0xd5]
-            msr PMICNTR_EL0, x3
-// CHECK:   msr PMICNTR_EL0, x3                 // encoding: [0x03,0x94,0x1b,0xd5]
-            mrs x3, PMICFILTR_EL0
-// CHECK:   mrs x3, PMICFILTR_EL0               // encoding: [0x03,0x96,0x3b,0xd5]
-            msr PMICFILTR_EL0, x3
-// CHECK:   msr PMICFILTR_EL0, x3               // encoding: [0x03,0x96,0x1b,0xd5]
+mrs x3, PMICNTR_EL0
+// CHECK-INST: mrs x3, PMICNTR_EL0
+// CHECK-ENCODING: encoding: [0x03,0x94,0x3b,0xd5]
+// CHECK-UNKNOWN:  d53b9403      mrs x3, PMICNTR_EL0
+
+msr PMICNTR_EL0, x3
+// CHECK-INST: msr PMICNTR_EL0, x3
+// CHECK-ENCODING: encoding: [0x03,0x94,0x1b,0xd5]
+// CHECK-UNKNOWN:  d51b9403      msr PMICNTR_EL0, x3
+
+mrs x3, PMICFILTR_EL0
+// CHECK-INST: mrs x3, PMICFILTR_EL0
+// CHECK-ENCODING: encoding: [0x03,0x96,0x3b,0xd5]
+// CHECK-UNKNOWN:  d53b9603      mrs x3, PMICFILTR_EL0
+
+msr PMICFILTR_EL0, x3
+// CHECK-INST: msr PMICFILTR_EL0, x3
+// CHECK-ENCODING: encoding: [0x03,0x96,0x1b,0xd5]
+// CHECK-UNKNOWN:  d51b9603      msr PMICFILTR_EL0, x3
 
 // FEAT_PMUv3p9/FEAT_PMUV3_ICNTR
-            msr PMZR_EL0, x3
-// CHECK:   msr PMZR_EL0, x3                    // encoding: [0x83,0x9d,0x1b,0xd5]
+msr PMZR_EL0, x3
+// CHECK-INST: msr PMZR_EL0, x3
+// CHECK-ENCODING: encoding: [0x83,0x9d,0x1b,0xd5]
+// CHECK-UNKNOWN:  d51b9d83      msr PMZR_EL0, x3
 
 // FEAT_SEBEP
-            mrs	x3, PMECR_EL1
-// CHECK:   mrs	x3, PMECR_EL1                   // encoding: [0xa3,0x9e,0x38,0xd5]
-            msr	PMECR_EL1, x1
-// CHECK:   msr	PMECR_EL1, x1                   // encoding: [0xa1,0x9e,0x18,0xd5]
-            mrs	x3, PMIAR_EL1
-// CHECK:   mrs	x3, PMIAR_EL1                   // encoding: [0xe3,0x9e,0x38,0xd5]
-            msr	PMIAR_EL1, x1
-// CHECK:   msr	PMIAR_EL1, x1                   // encoding: [0xe1,0x9e,0x18,0xd5]
+mrs x3, PMECR_EL1
+// CHECK-INST: mrs x3, PMECR_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x9e,0x38,0xd5]
+// CHECK-UNKNOWN:  d5389ea3      mrs x3, PMECR_EL1
+
+msr PMECR_EL1, x1
+// CHECK-INST: msr PMECR_EL1, x1
+// CHECK-ENCODING: encoding: [0xa1,0x9e,0x18,0xd5]
+// CHECK-UNKNOWN:  d5189ea1      msr PMECR_EL1, x1
+
+mrs x3, PMIAR_EL1
+// CHECK-INST: mrs x3, PMIAR_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x9e,0x38,0xd5]
+// CHECK-UNKNOWN:  d5389ee3      mrs x3, PMIAR_EL1
+
+msr PMIAR_EL1, x1
+// CHECK-INST: msr PMIAR_EL1, x1
+// CHECK-ENCODING: encoding: [0xe1,0x9e,0x18,0xd5]
+// CHECK-UNKNOWN:  d5189ee1      msr PMIAR_EL1, x1
 
 // FEAT_SPMU
-            mrs	x3, SPMACCESSR_EL1
-// CHECK:   mrs	x3, SPMACCESSR_EL1              // encoding: [0x63,0x9d,0x30,0xd5]
-            msr	SPMACCESSR_EL1, x1
-// CHECK:   msr	SPMACCESSR_EL1, x1              // encoding: [0x61,0x9d,0x10,0xd5]
-            mrs	x3, SPMACCESSR_EL12
-// CHECK:   mrs	x3, SPMACCESSR_EL12             // encoding: [0x63,0x9d,0x35,0xd5]
-            msr	SPMACCESSR_EL12, x1
-// CHECK:   msr	SPMACCESSR_EL12, x1             // encoding: [0x61,0x9d,0x15,0xd5]
-            mrs	x3, SPMACCESSR_EL2
-// CHECK:   mrs	x3, SPMACCESSR_EL2              // encoding: [0x63,0x9d,0x34,0xd5]
-            msr	SPMACCESSR_EL2, x1
-// CHECK:   msr	SPMACCESSR_EL2, x1              // encoding: [0x61,0x9d,0x14,0xd5]
-            mrs	x3, SPMACCESSR_EL3
-// CHECK:   mrs	x3, SPMACCESSR_EL3              // encoding: [0x63,0x9d,0x36,0xd5]
-            msr	SPMACCESSR_EL3, x1
-// CHECK:   msr	SPMACCESSR_EL3, x1              // encoding: [0x61,0x9d,0x16,0xd5]
-            mrs	x3, SPMCNTENCLR_EL0
-// CHECK:   mrs	x3, SPMCNTENCLR_EL0             // encoding: [0x43,0x9c,0x33,0xd5]
-            msr	SPMCNTENCLR_EL0, x1
-// CHECK:   msr	SPMCNTENCLR_EL0, x1             // encoding: [0x41,0x9c,0x13,0xd5]
-            mrs	x3, SPMCNTENSET_EL0
-// CHECK:   mrs	x3, SPMCNTENSET_EL0             // encoding: [0x23,0x9c,0x33,0xd5]
-            msr	SPMCNTENSET_EL0, x1
-// CHECK:   msr	SPMCNTENSET_EL0, x1             // encoding: [0x21,0x9c,0x13,0xd5]
-            mrs	x3, SPMCR_EL0
-// CHECK:   mrs	x3, SPMCR_EL0                   // encoding: [0x03,0x9c,0x33,0xd5]
-            msr	SPMCR_EL0, x1
-// CHECK:   msr	SPMCR_EL0, x1                   // encoding: [0x01,0x9c,0x13,0xd5]
-            mrs	x3, SPMDEVAFF_EL1
-// CHECK:   mrs	x3, SPMDEVAFF_EL1               // encoding: [0xc3,0x9d,0x30,0xd5]
-            mrs	x3, SPMDEVARCH_EL1
-// CHECK:   mrs	x3, SPMDEVARCH_EL1              // encoding: [0xa3,0x9d,0x30,0xd5]
-
-            mrs	x3, SPMEVCNTR0_EL0
-// CHECK:   mrs	x3, SPMEVCNTR0_EL0              // encoding: [0x03,0xe0,0x33,0xd5]
-            msr	SPMEVCNTR0_EL0, x1
-// CHECK:   msr	SPMEVCNTR0_EL0, x1              // encoding: [0x01,0xe0,0x13,0xd5]
-            mrs	x3, SPMEVCNTR1_EL0
-// CHECK:   mrs	x3, SPMEVCNTR1_EL0              // encoding: [0x23,0xe0,0x33,0xd5]
-            msr	SPMEVCNTR1_EL0, x1
-// CHECK:   msr	SPMEVCNTR1_EL0, x1              // encoding: [0x21,0xe0,0x13,0xd5]
-            mrs	x3, SPMEVCNTR2_EL0
-// CHECK:   mrs	x3, SPMEVCNTR2_EL0              // encoding: [0x43,0xe0,0x33,0xd5]
-            msr	SPMEVCNTR2_EL0, x1
-// CHECK:   msr	SPMEVCNTR2_EL0, x1              // encoding: [0x41,0xe0,0x13,0xd5]
-            mrs	x3, SPMEVCNTR3_EL0
-// CHECK:   mrs	x3, SPMEVCNTR3_EL0              // encoding: [0x63,0xe0,0x33,0xd5]
-            msr	SPMEVCNTR3_EL0, x1
-// CHECK:   msr	SPMEVCNTR3_EL0, x1              // encoding: [0x61,0xe0,0x13,0xd5]
-            mrs	x3, SPMEVCNTR4_EL0
-// CHECK:   mrs	x3, SPMEVCNTR4_EL0              // encoding: [0x83,0xe0,0x33,0xd5]
-            msr	SPMEVCNTR4_EL0, x1
-// CHECK:   msr	SPMEVCNTR4_EL0, x1              // encoding: [0x81,0xe0,0x13,0xd5]
-            mrs	x3, SPMEVCNTR5_EL0
-// CHECK:   mrs	x3, SPMEVCNTR5_EL0              // encoding: [0xa3,0xe0,0x33,0xd5]
-            msr	SPMEVCNTR5_EL0, x1
-// CHECK:   msr	SPMEVCNTR5_EL0, x1              // encoding: [0xa1,0xe0,0x13,0xd5]
-            mrs	x3, SPMEVCNTR6_EL0
-// CHECK:   mrs	x3, SPMEVCNTR6_EL0              // encoding: [0xc3,0xe0,0x33,0xd5]
-            msr	SPMEVCNTR6_EL0, x1
-// CHECK:   msr	SPMEVCNTR6_EL0, x1              // encoding: [0xc1,0xe0,0x13,0xd5]
-            mrs	x3, SPMEVCNTR7_EL0
-// CHECK:   mrs	x3, SPMEVCNTR7_EL0              // encoding: [0xe3,0xe0,0x33,0xd5]
-            msr	SPMEVCNTR7_EL0, x1
-// CHECK:   msr	SPMEVCNTR7_EL0, x1              // encoding: [0xe1,0xe0,0x13,0xd5]
-            mrs	x3, SPMEVCNTR8_EL0
-// CHECK:   mrs	x3, SPMEVCNTR8_EL0              // encoding: [0x03,0xe1,0x33,0xd5]
-            msr	SPMEVCNTR8_EL0, x1
-// CHECK:   msr	SPMEVCNTR8_EL0, x1              // encoding: [0x01,0xe1,0x13,0xd5]
-            mrs	x3, SPMEVCNTR9_EL0
-// CHECK:   mrs	x3, SPMEVCNTR9_EL0              // encoding: [0x23,0xe1,0x33,0xd5]
-            msr	SPMEVCNTR9_EL0, x1
-// CHECK:   msr	SPMEVCNTR9_EL0, x1              // encoding: [0x21,0xe1,0x13,0xd5]
-            mrs	x3, SPMEVCNTR10_EL0
-// CHECK:   mrs	x3, SPMEVCNTR10_EL0             // encoding: [0x43,0xe1,0x33,0xd5]
-            msr	SPMEVCNTR10_EL0, x1
-// CHECK:   msr	SPMEVCNTR10_EL0, x1             // encoding: [0x41,0xe1,0x13,0xd5]
-            mrs	x3, SPMEVCNTR11_EL0
-// CHECK:   mrs	x3, SPMEVCNTR11_EL0             // encoding: [0x63,0xe1,0x33,0xd5]
-            msr	SPMEVCNTR11_EL0, x1
-// CHECK:   msr	SPMEVCNTR11_EL0, x1             // encoding: [0x61,0xe1,0x13,0xd5]
-            mrs	x3, SPMEVCNTR12_EL0
-// CHECK:   mrs	x3, SPMEVCNTR12_EL0             // encoding: [0x83,0xe1,0x33,0xd5]
-            msr	SPMEVCNTR12_EL0, x1
-// CHECK:   msr	SPMEVCNTR12_EL0, x1             // encoding: [0x81,0xe1,0x13,0xd5]
-            mrs	x3, SPMEVCNTR13_EL0
-// CHECK:   mrs	x3, SPMEVCNTR13_EL0             // encoding: [0xa3,0xe1,0x33,0xd5]
-            msr	SPMEVCNTR13_EL0, x1
-// CHECK:   msr	SPMEVCNTR13_EL0, x1             // encoding: [0xa1,0xe1,0x13,0xd5]
-            mrs	x3, SPMEVCNTR14_EL0
-// CHECK:   mrs	x3, SPMEVCNTR14_EL0             // encoding: [0xc3,0xe1,0x33,0xd5]
-            msr	SPMEVCNTR14_EL0, x1
-// CHECK:   msr	SPMEVCNTR14_EL0, x1             // encoding: [0xc1,0xe1,0x13,0xd5]
-            mrs	x3, SPMEVCNTR15_EL0
-// CHECK:   mrs	x3, SPMEVCNTR15_EL0             // encoding: [0xe3,0xe1,0x33,0xd5]
-            msr	SPMEVCNTR15_EL0, x1
-// CHECK:   msr	SPMEVCNTR15_EL0, x1             // encoding: [0xe1,0xe1,0x13,0xd5]
-
-            mrs	x3, SPMEVFILT2R0_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R0_EL0            // encoding: [0x03,0xe6,0x33,0xd5]
-            msr	SPMEVFILT2R0_EL0, x1
-// CHECK:   msr	SPMEVFILT2R0_EL0, x1            // encoding: [0x01,0xe6,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R1_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R1_EL0            // encoding: [0x23,0xe6,0x33,0xd5]
-            msr	SPMEVFILT2R1_EL0, x1
-// CHECK:   msr	SPMEVFILT2R1_EL0, x1            // encoding: [0x21,0xe6,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R2_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R2_EL0            // encoding: [0x43,0xe6,0x33,0xd5]
-            msr	SPMEVFILT2R2_EL0, x1
-// CHECK:   msr	SPMEVFILT2R2_EL0, x1            // encoding: [0x41,0xe6,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R3_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R3_EL0            // encoding: [0x63,0xe6,0x33,0xd5]
-            msr	SPMEVFILT2R3_EL0, x1
-// CHECK:   msr	SPMEVFILT2R3_EL0, x1            // encoding: [0x61,0xe6,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R4_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R4_EL0            // encoding: [0x83,0xe6,0x33,0xd5]
-            msr	SPMEVFILT2R4_EL0, x1
-// CHECK:   msr	SPMEVFILT2R4_EL0, x1            // encoding: [0x81,0xe6,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R5_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R5_EL0            // encoding: [0xa3,0xe6,0x33,0xd5]
-            msr	SPMEVFILT2R5_EL0, x1
-// CHECK:   msr	SPMEVFILT2R5_EL0, x1            // encoding: [0xa1,0xe6,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R6_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R6_EL0            // encoding: [0xc3,0xe6,0x33,0xd5]
-            msr	SPMEVFILT2R6_EL0, x1
-// CHECK:   msr	SPMEVFILT2R6_EL0, x1            // encoding: [0xc1,0xe6,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R7_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R7_EL0            // encoding: [0xe3,0xe6,0x33,0xd5]
-            msr	SPMEVFILT2R7_EL0, x1
-// CHECK:   msr	SPMEVFILT2R7_EL0, x1            // encoding: [0xe1,0xe6,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R8_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R8_EL0            // encoding: [0x03,0xe7,0x33,0xd5]
-            msr	SPMEVFILT2R8_EL0, x1
-// CHECK:   msr	SPMEVFILT2R8_EL0, x1            // encoding: [0x01,0xe7,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R9_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R9_EL0            // encoding: [0x23,0xe7,0x33,0xd5]
-            msr	SPMEVFILT2R9_EL0, x1
-// CHECK:   msr	SPMEVFILT2R9_EL0, x1            // encoding: [0x21,0xe7,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R10_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R10_EL0           // encoding: [0x43,0xe7,0x33,0xd5]
-            msr	SPMEVFILT2R10_EL0, x1
-// CHECK:   msr	SPMEVFILT2R10_EL0, x1           // encoding: [0x41,0xe7,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R11_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R11_EL0           // encoding: [0x63,0xe7,0x33,0xd5]
-            msr	SPMEVFILT2R11_EL0, x1
-// CHECK:   msr	SPMEVFILT2R11_EL0, x1           // encoding: [0x61,0xe7,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R12_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R12_EL0           // encoding: [0x83,0xe7,0x33,0xd5]
-            msr	SPMEVFILT2R12_EL0, x1
-// CHECK:   msr	SPMEVFILT2R12_EL0, x1           // encoding: [0x81,0xe7,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R13_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R13_EL0           // encoding: [0xa3,0xe7,0x33,0xd5]
-            msr	SPMEVFILT2R13_EL0, x1
-// CHECK:   msr	SPMEVFILT2R13_EL0, x1           // encoding: [0xa1,0xe7,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R14_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R14_EL0           // encoding: [0xc3,0xe7,0x33,0xd5]
-            msr	SPMEVFILT2R14_EL0, x1
-// CHECK:   msr	SPMEVFILT2R14_EL0, x1           // encoding: [0xc1,0xe7,0x13,0xd5]
-            mrs	x3, SPMEVFILT2R15_EL0
-// CHECK:   mrs	x3, SPMEVFILT2R15_EL0           // encoding: [0xe3,0xe7,0x33,0xd5]
-            msr	SPMEVFILT2R15_EL0, x1
-// CHECK:   msr	SPMEVFILT2R15_EL0, x1           // encoding: [0xe1,0xe7,0x13,0xd5]
-
-            mrs	x3, SPMEVFILTR0_EL0
-// CHECK:   mrs	x3, SPMEVFILTR0_EL0             // encoding: [0x03,0xe4,0x33,0xd5]
-            msr	SPMEVFILTR0_EL0, x1
-// CHECK:   msr	SPMEVFILTR0_EL0, x1             // encoding: [0x01,0xe4,0x13,0xd5]
-            mrs	x3, SPMEVFILTR1_EL0
-// CHECK:   mrs	x3, SPMEVFILTR1_EL0             // encoding: [0x23,0xe4,0x33,0xd5]
-            msr	SPMEVFILTR1_EL0, x1
-// CHECK:   msr	SPMEVFILTR1_EL0, x1             // encoding: [0x21,0xe4,0x13,0xd5]
-            mrs	x3, SPMEVFILTR2_EL0
-// CHECK:   mrs	x3, SPMEVFILTR2_EL0             // encoding: [0x43,0xe4,0x33,0xd5]
-            msr	SPMEVFILTR2_EL0, x1
-// CHECK:   msr	SPMEVFILTR2_EL0, x1             // encoding: [0x41,0xe4,0x13,0xd5]
-            mrs	x3, SPMEVFILTR3_EL0
-// CHECK:   mrs	x3, SPMEVFILTR3_EL0             // encoding: [0x63,0xe4,0x33,0xd5]
-            msr	SPMEVFILTR3_EL0, x1
-// CHECK:   msr	SPMEVFILTR3_EL0, x1             // encoding: [0x61,0xe4,0x13,0xd5]
-            mrs	x3, SPMEVFILTR4_EL0
-// CHECK:   mrs	x3, SPMEVFILTR4_EL0             // encoding: [0x83,0xe4,0x33,0xd5]
-            msr	SPMEVFILTR4_EL0, x1
-// CHECK:   msr	SPMEVFILTR4_EL0, x1             // encoding: [0x81,0xe4,0x13,0xd5]
-            mrs	x3, SPMEVFILTR5_EL0
-// CHECK:   mrs	x3, SPMEVFILTR5_EL0             // encoding: [0xa3,0xe4,0x33,0xd5]
-            msr	SPMEVFILTR5_EL0, x1
-// CHECK:   msr	SPMEVFILTR5_EL0, x1             // encoding: [0xa1,0xe4,0x13,0xd5]
-            mrs	x3, SPMEVFILTR6_EL0
-// CHECK:   mrs	x3, SPMEVFILTR6_EL0             // encoding: [0xc3,0xe4,0x33,0xd5]
-            msr	SPMEVFILTR6_EL0, x1
-// CHECK:   msr	SPMEVFILTR6_EL0, x1             // encoding: [0xc1,0xe4,0x13,0xd5]
-            mrs	x3, SPMEVFILTR7_EL0
-// CHECK:   mrs	x3, SPMEVFILTR7_EL0             // encoding: [0xe3,0xe4,0x33,0xd5]
-            msr	SPMEVFILTR7_EL0, x1
-// CHECK:   msr	SPMEVFILTR7_EL0, x1             // encoding: [0xe1,0xe4,0x13,0xd5]
-            mrs	x3, SPMEVFILTR8_EL0
-// CHECK:   mrs	x3, SPMEVFILTR8_EL0             // encoding: [0x03,0xe5,0x33,0xd5]
-            msr	SPMEVFILTR8_EL0, x1
-// CHECK:   msr	SPMEVFILTR8_EL0, x1             // encoding: [0x01,0xe5,0x13,0xd5]
-            mrs	x3, SPMEVFILTR9_EL0
-// CHECK:   mrs	x3, SPMEVFILTR9_EL0             // encoding: [0x23,0xe5,0x33,0xd5]
-            msr	SPMEVFILTR9_EL0, x1
-// CHECK:   msr	SPMEVFILTR9_EL0, x1             // encoding: [0x21,0xe5,0x13,0xd5]
-            mrs	x3, SPMEVFILTR10_EL0
-// CHECK:   mrs	x3, SPMEVFILTR10_EL0            // encoding: [0x43,0xe5,0x33,0xd5]
-            msr	SPMEVFILTR10_EL0, x1
-// CHECK:   msr	SPMEVFILTR10_EL0, x1            // encoding: [0x41,0xe5,0x13,0xd5]
-            mrs	x3, SPMEVFILTR11_EL0
-// CHECK:   mrs	x3, SPMEVFILTR11_EL0            // encoding: [0x63,0xe5,0x33,0xd5]
-            msr	SPMEVFILTR11_EL0, x1
-// CHECK:   msr	SPMEVFILTR11_EL0, x1            // encoding: [0x61,0xe5,0x13,0xd5]
-            mrs	x3, SPMEVFILTR12_EL0
-// CHECK:   mrs	x3, SPMEVFILTR12_EL0            // encoding: [0x83,0xe5,0x33,0xd5]
-            msr	SPMEVFILTR12_EL0, x1
-// CHECK:   msr	SPMEVFILTR12_EL0, x1            // encoding: [0x81,0xe5,0x13,0xd5]
-            mrs	x3, SPMEVFILTR13_EL0
-// CHECK:   mrs	x3, SPMEVFILTR13_EL0            // encoding: [0xa3,0xe5,0x33,0xd5]
-            msr	SPMEVFILTR13_EL0, x1
-// CHECK:   msr	SPMEVFILTR13_EL0, x1            // encoding: [0xa1,0xe5,0x13,0xd5]
-            mrs	x3, SPMEVFILTR14_EL0
-// CHECK:   mrs	x3, SPMEVFILTR14_EL0            // encoding: [0xc3,0xe5,0x33,0xd5]
-            msr	SPMEVFILTR14_EL0, x1
-// CHECK:   msr	SPMEVFILTR14_EL0, x1            // encoding: [0xc1,0xe5,0x13,0xd5]
-            mrs	x3, SPMEVFILTR15_EL0
-// CHECK:   mrs	x3, SPMEVFILTR15_EL0            // encoding: [0xe3,0xe5,0x33,0xd5]
-            msr	SPMEVFILTR15_EL0, x1
-// CHECK:   msr	SPMEVFILTR15_EL0, x1            // encoding: [0xe1,0xe5,0x13,0xd5]
-
-            mrs	x3, SPMEVTYPER0_EL0
-// CHECK:   mrs	x3, SPMEVTYPER0_EL0             // encoding: [0x03,0xe2,0x33,0xd5]
-            msr	SPMEVTYPER0_EL0, x1
-// CHECK:   msr	SPMEVTYPER0_EL0, x1             // encoding: [0x01,0xe2,0x13,0xd5]
-            mrs	x3, SPMEVTYPER1_EL0
-// CHECK:   mrs	x3, SPMEVTYPER1_EL0             // encoding: [0x23,0xe2,0x33,0xd5]
-            msr	SPMEVTYPER1_EL0, x1
-// CHECK:   msr	SPMEVTYPER1_EL0, x1             // encoding: [0x21,0xe2,0x13,0xd5]
-            mrs	x3, SPMEVTYPER2_EL0
-// CHECK:   mrs	x3, SPMEVTYPER2_EL0             // encoding: [0x43,0xe2,0x33,0xd5]
-            msr	SPMEVTYPER2_EL0, x1
-// CHECK:   msr	SPMEVTYPER2_EL0, x1             // encoding: [0x41,0xe2,0x13,0xd5]
-            mrs	x3, SPMEVTYPER3_EL0
-// CHECK:   mrs	x3, SPMEVTYPER3_EL0             // encoding: [0x63,0xe2,0x33,0xd5]
-            msr	SPMEVTYPER3_EL0, x1
-// CHECK:   msr	SPMEVTYPER3_EL0, x1             // encoding: [0x61,0xe2,0x13,0xd5]
-            mrs	x3, SPMEVTYPER4_EL0
-// CHECK:   mrs	x3, SPMEVTYPER4_EL0             // encoding: [0x83,0xe2,0x33,0xd5]
-            msr	SPMEVTYPER4_EL0, x1
-// CHECK:   msr	SPMEVTYPER4_EL0, x1             // encoding: [0x81,0xe2,0x13,0xd5]
-            mrs	x3, SPMEVTYPER5_EL0
-// CHECK:   mrs	x3, SPMEVTYPER5_EL0             // encoding: [0xa3,0xe2,0x33,0xd5]
-            msr	SPMEVTYPER5_EL0, x1
-// CHECK:   msr	SPMEVTYPER5_EL0, x1             // encoding: [0xa1,0xe2,0x13,0xd5]
-            mrs	x3, SPMEVTYPER6_EL0
-// CHECK:   mrs	x3, SPMEVTYPER6_EL0             // encoding: [0xc3,0xe2,0x33,0xd5]
-            msr	SPMEVTYPER6_EL0, x1
-// CHECK:   msr	SPMEVTYPER6_EL0, x1             // encoding: [0xc1,0xe2,0x13,0xd5]
-            mrs	x3, SPMEVTYPER7_EL0
-// CHECK:   mrs	x3, SPMEVTYPER7_EL0             // encoding: [0xe3,0xe2,0x33,0xd5]
-            msr	SPMEVTYPER7_EL0, x1
-// CHECK:   msr	SPMEVTYPER7_EL0, x1             // encoding: [0xe1,0xe2,0x13,0xd5]
-            mrs	x3, SPMEVTYPER8_EL0
-// CHECK:   mrs	x3, SPMEVTYPER8_EL0             // encoding: [0x03,0xe3,0x33,0xd5]
-            msr	SPMEVTYPER8_EL0, x1
-// CHECK:   msr	SPMEVTYPER8_EL0, x1             // encoding: [0x01,0xe3,0x13,0xd5]
-            mrs	x3, SPMEVTYPER9_EL0
-// CHECK:   mrs	x3, SPMEVTYPER9_EL0             // encoding: [0x23,0xe3,0x33,0xd5]
-            msr	SPMEVTYPER9_EL0, x1
-// CHECK:   msr	SPMEVTYPER9_EL0, x1             // encoding: [0x21,0xe3,0x13,0xd5]
-            mrs	x3, SPMEVTYPER10_EL0
-// CHECK:   mrs	x3, SPMEVTYPER10_EL0            // encoding: [0x43,0xe3,0x33,0xd5]
-            msr	SPMEVTYPER10_EL0, x1
-// CHECK:   msr	SPMEVTYPER10_EL0, x1            // encoding: [0x41,0xe3,0x13,0xd5]
-            mrs	x3, SPMEVTYPER11_EL0
-// CHECK:   mrs	x3, SPMEVTYPER11_EL0            // encoding: [0x63,0xe3,0x33,0xd5]
-            msr	SPMEVTYPER11_EL0, x1
-// CHECK:   msr	SPMEVTYPER11_EL0, x1            // encoding: [0x61,0xe3,0x13,0xd5]
-            mrs	x3, SPMEVTYPER12_EL0
-// CHECK:   mrs	x3, SPMEVTYPER12_EL0            // encoding: [0x83,0xe3,0x33,0xd5]
-            msr	SPMEVTYPER12_EL0, x1
-// CHECK:   msr	SPMEVTYPER12_EL0, x1            // encoding: [0x81,0xe3,0x13,0xd5]
-            mrs	x3, SPMEVTYPER13_EL0
-// CHECK:   mrs	x3, SPMEVTYPER13_EL0            // encoding: [0xa3,0xe3,0x33,0xd5]
-            msr	SPMEVTYPER13_EL0, x1
-// CHECK:   msr	SPMEVTYPER13_EL0, x1            // encoding: [0xa1,0xe3,0x13,0xd5]
-            mrs	x3, SPMEVTYPER14_EL0
-// CHECK:   mrs	x3, SPMEVTYPER14_EL0            // encoding: [0xc3,0xe3,0x33,0xd5]
-            msr	SPMEVTYPER14_EL0, x1
-// CHECK:   msr	SPMEVTYPER14_EL0, x1            // encoding: [0xc1,0xe3,0x13,0xd5]
-            mrs	x3, SPMEVTYPER15_EL0
-// CHECK:   mrs	x3, SPMEVTYPER15_EL0            // encoding: [0xe3,0xe3,0x33,0xd5]
-            msr	SPMEVTYPER15_EL0, x1
-// CHECK:   msr	SPMEVTYPER15_EL0, x1            // encoding: [0xe1,0xe3,0x13,0xd5]
-
-            mrs	x3, SPMIIDR_EL1
-// CHECK:   mrs	x3, SPMIIDR_EL1                 // encoding: [0x83,0x9d,0x30,0xd5]
-            mrs	x3, SPMINTENCLR_EL1
-// CHECK:   mrs	x3, SPMINTENCLR_EL1             // encoding: [0x43,0x9e,0x30,0xd5]
-            msr	SPMINTENCLR_EL1, x1
-// CHECK:   msr	SPMINTENCLR_EL1, x1             // encoding: [0x41,0x9e,0x10,0xd5]
-            mrs	x3, SPMINTENSET_EL1
-// CHECK:   mrs	x3, SPMINTENSET_EL1             // encoding: [0x23,0x9e,0x30,0xd5]
-            msr	SPMINTENSET_EL1, x1
-// CHECK:   msr	SPMINTENSET_EL1, x1             // encoding: [0x21,0x9e,0x10,0xd5]
-            mrs	x3, SPMOVSCLR_EL0
-// CHECK:   mrs	x3, SPMOVSCLR_EL0               // encoding: [0x63,0x9c,0x33,0xd5]
-            msr	SPMOVSCLR_EL0, x1
-// CHECK:   msr	SPMOVSCLR_EL0, x1               // encoding: [0x61,0x9c,0x13,0xd5]
-            mrs	x3, SPMOVSSET_EL0
-// CHECK:   mrs	x3, SPMOVSSET_EL0               // encoding: [0x63,0x9e,0x33,0xd5]
-            msr	SPMOVSSET_EL0, x1
-// CHECK:   msr	SPMOVSSET_EL0, x1               // encoding: [0x61,0x9e,0x13,0xd5]
-            mrs	x3, SPMSELR_EL0
-// CHECK:   mrs	x3, SPMSELR_EL0                 // encoding: [0xa3,0x9c,0x33,0xd5]
-            msr	SPMSELR_EL0, x1
-// CHECK:   msr	SPMSELR_EL0, x1                 // encoding: [0xa1,0x9c,0x13,0xd5]
-            mrs x3, SPMCGCR0_EL1
-// CHECK:   mrs x3, SPMCGCR0_EL1                // encoding: [0x03,0x9d,0x30,0xd5]
-            mrs x3, SPMCGCR1_EL1
-// CHECK:   mrs x3, SPMCGCR1_EL1                // encoding: [0x23,0x9d,0x30,0xd5]
-            mrs x3, SPMCFGR_EL1
-// CHECK:   mrs x3, SPMCFGR_EL1                 // encoding: [0xe3,0x9d,0x30,0xd5]
-            mrs x3, SPMROOTCR_EL3
-// CHECK:   mrs x3, SPMROOTCR_EL3               // encoding: [0xe3,0x9e,0x36,0xd5]
-            msr SPMROOTCR_EL3, x3
-// CHECK:   msr SPMROOTCR_EL3, x3               // encoding: [0xe3,0x9e,0x16,0xd5]
-            mrs x3, SPMSCR_EL1
-// CHECK:   mrs x3, SPMSCR_EL1                  // encoding: [0xe3,0x9e,0x37,0xd5]
-            msr SPMSCR_EL1, x3
-// CHECK:   msr SPMSCR_EL1, x3                  // encoding: [0xe3,0x9e,0x17,0xd5]
+mrs x3, SPMACCESSR_EL1
+// CHECK-INST: mrs x3, SPMACCESSR_EL1
+// CHECK-ENCODING: encoding: [0x63,0x9d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309d63      mrs x3, SPMACCESSR_EL1
+
+msr SPMACCESSR_EL1, x1
+// CHECK-INST: msr SPMACCESSR_EL1, x1
+// CHECK-ENCODING: encoding: [0x61,0x9d,0x10,0xd5]
+// CHECK-UNKNOWN:  d5109d61      msr SPMACCESSR_EL1, x1
+
+mrs x3, SPMACCESSR_EL12
+// CHECK-INST: mrs x3, SPMACCESSR_EL12
+// CHECK-ENCODING: encoding: [0x63,0x9d,0x35,0xd5]
+// CHECK-UNKNOWN:  d5359d63      mrs x3, SPMACCESSR_EL12
+
+msr SPMACCESSR_EL12, x1
+// CHECK-INST: msr SPMACCESSR_EL12, x1
+// CHECK-ENCODING: encoding: [0x61,0x9d,0x15,0xd5]
+// CHECK-UNKNOWN:  d5159d61      msr SPMACCESSR_EL12, x1
+
+mrs x3, SPMACCESSR_EL2
+// CHECK-INST: mrs x3, SPMACCESSR_EL2
+// CHECK-ENCODING: encoding: [0x63,0x9d,0x34,0xd5]
+// CHECK-UNKNOWN:  d5349d63      mrs x3, SPMACCESSR_EL2
+
+msr SPMACCESSR_EL2, x1
+// CHECK-INST: msr SPMACCESSR_EL2, x1
+// CHECK-ENCODING: encoding: [0x61,0x9d,0x14,0xd5]
+// CHECK-UNKNOWN:  d5149d61      msr SPMACCESSR_EL2, x1
+
+mrs x3, SPMACCESSR_EL3
+// CHECK-INST: mrs x3, SPMACCESSR_EL3
+// CHECK-ENCODING: encoding: [0x63,0x9d,0x36,0xd5]
+// CHECK-UNKNOWN:  d5369d63      mrs x3, SPMACCESSR_EL3
+
+msr SPMACCESSR_EL3, x1
+// CHECK-INST: msr SPMACCESSR_EL3, x1
+// CHECK-ENCODING: encoding: [0x61,0x9d,0x16,0xd5]
+// CHECK-UNKNOWN:  d5169d61      msr SPMACCESSR_EL3, x1
+
+mrs x3, SPMCNTENCLR_EL0
+// CHECK-INST: mrs x3, SPMCNTENCLR_EL0
+// CHECK-ENCODING: encoding: [0x43,0x9c,0x33,0xd5]
+// CHECK-UNKNOWN:  d5339c43      mrs x3, SPMCNTENCLR_EL0
+
+msr SPMCNTENCLR_EL0, x1
+// CHECK-INST: msr SPMCNTENCLR_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0x9c,0x13,0xd5]
+// CHECK-UNKNOWN:  d5139c41      msr SPMCNTENCLR_EL0, x1
+
+mrs x3, SPMCNTENSET_EL0
+// CHECK-INST: mrs x3, SPMCNTENSET_EL0
+// CHECK-ENCODING: encoding: [0x23,0x9c,0x33,0xd5]
+// CHECK-UNKNOWN:  d5339c23      mrs x3, SPMCNTENSET_EL0
+
+msr SPMCNTENSET_EL0, x1
+// CHECK-INST: msr SPMCNTENSET_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0x9c,0x13,0xd5]
+// CHECK-UNKNOWN:  d5139c21      msr SPMCNTENSET_EL0, x1
+
+mrs x3, SPMCR_EL0
+// CHECK-INST: mrs x3, SPMCR_EL0
+// CHECK-ENCODING: encoding: [0x03,0x9c,0x33,0xd5]
+// CHECK-UNKNOWN:  d5339c03      mrs x3, SPMCR_EL0
+
+msr SPMCR_EL0, x1
+// CHECK-INST: msr SPMCR_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0x9c,0x13,0xd5]
+// CHECK-UNKNOWN:  d5139c01      msr SPMCR_EL0, x1
+
+mrs x3, SPMDEVAFF_EL1
+// CHECK-INST: mrs x3, SPMDEVAFF_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x9d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309dc3      mrs x3, SPMDEVAFF_EL1
+
+mrs x3, SPMDEVARCH_EL1
+// CHECK-INST: mrs x3, SPMDEVARCH_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x9d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309da3      mrs x3, SPMDEVARCH_EL1
+
+mrs x3, SPMEVCNTR0_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR0_EL0
+// CHECK-ENCODING: encoding: [0x03,0xe0,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e003      mrs x3, SPMEVCNTR0_EL0
+
+msr SPMEVCNTR0_EL0, x1
+// CHECK-INST: msr SPMEVCNTR0_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0xe0,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e001      msr SPMEVCNTR0_EL0, x1
+
+mrs x3, SPMEVCNTR1_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR1_EL0
+// CHECK-ENCODING: encoding: [0x23,0xe0,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e023      mrs x3, SPMEVCNTR1_EL0
+
+msr SPMEVCNTR1_EL0, x1
+// CHECK-INST: msr SPMEVCNTR1_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0xe0,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e021      msr SPMEVCNTR1_EL0, x1
+
+mrs x3, SPMEVCNTR2_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR2_EL0
+// CHECK-ENCODING: encoding: [0x43,0xe0,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e043      mrs x3, SPMEVCNTR2_EL0
+
+msr SPMEVCNTR2_EL0, x1
+// CHECK-INST: msr SPMEVCNTR2_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0xe0,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e041      msr SPMEVCNTR2_EL0, x1
+
+mrs x3, SPMEVCNTR3_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR3_EL0
+// CHECK-ENCODING: encoding: [0x63,0xe0,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e063      mrs x3, SPMEVCNTR3_EL0
+
+msr SPMEVCNTR3_EL0, x1
+// CHECK-INST: msr SPMEVCNTR3_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0xe0,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e061      msr SPMEVCNTR3_EL0, x1
+
+mrs x3, SPMEVCNTR4_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR4_EL0
+// CHECK-ENCODING: encoding: [0x83,0xe0,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e083      mrs x3, SPMEVCNTR4_EL0
+
+msr SPMEVCNTR4_EL0, x1
+// CHECK-INST: msr SPMEVCNTR4_EL0, x1
+// CHECK-ENCODING: encoding: [0x81,0xe0,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e081      msr SPMEVCNTR4_EL0, x1
+
+mrs x3, SPMEVCNTR5_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR5_EL0
+// CHECK-ENCODING: encoding: [0xa3,0xe0,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e0a3      mrs x3, SPMEVCNTR5_EL0
+
+msr SPMEVCNTR5_EL0, x1
+// CHECK-INST: msr SPMEVCNTR5_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0xe0,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e0a1      msr SPMEVCNTR5_EL0, x1
+
+mrs x3, SPMEVCNTR6_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR6_EL0
+// CHECK-ENCODING: encoding: [0xc3,0xe0,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e0c3      mrs x3, SPMEVCNTR6_EL0
+
+msr SPMEVCNTR6_EL0, x1
+// CHECK-INST: msr SPMEVCNTR6_EL0, x1
+// CHECK-ENCODING: encoding: [0xc1,0xe0,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e0c1      msr SPMEVCNTR6_EL0, x1
+
+mrs x3, SPMEVCNTR7_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR7_EL0
+// CHECK-ENCODING: encoding: [0xe3,0xe0,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e0e3      mrs x3, SPMEVCNTR7_EL0
+
+msr SPMEVCNTR7_EL0, x1
+// CHECK-INST: msr SPMEVCNTR7_EL0, x1
+// CHECK-ENCODING: encoding: [0xe1,0xe0,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e0e1      msr SPMEVCNTR7_EL0, x1
+
+mrs x3, SPMEVCNTR8_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR8_EL0
+// CHECK-ENCODING: encoding: [0x03,0xe1,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e103      mrs x3, SPMEVCNTR8_EL0
+
+msr SPMEVCNTR8_EL0, x1
+// CHECK-INST: msr SPMEVCNTR8_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0xe1,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e101      msr SPMEVCNTR8_EL0, x1
+
+mrs x3, SPMEVCNTR9_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR9_EL0
+// CHECK-ENCODING: encoding: [0x23,0xe1,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e123      mrs x3, SPMEVCNTR9_EL0
+
+msr SPMEVCNTR9_EL0, x1
+// CHECK-INST: msr SPMEVCNTR9_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0xe1,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e121      msr SPMEVCNTR9_EL0, x1
+
+mrs x3, SPMEVCNTR10_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR10_EL0
+// CHECK-ENCODING: encoding: [0x43,0xe1,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e143      mrs x3, SPMEVCNTR10_EL0
+
+msr SPMEVCNTR10_EL0, x1
+// CHECK-INST: msr SPMEVCNTR10_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0xe1,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e141      msr SPMEVCNTR10_EL0, x1
+
+mrs x3, SPMEVCNTR11_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR11_EL0
+// CHECK-ENCODING: encoding: [0x63,0xe1,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e163      mrs x3, SPMEVCNTR11_EL0
+
+msr SPMEVCNTR11_EL0, x1
+// CHECK-INST: msr SPMEVCNTR11_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0xe1,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e161      msr SPMEVCNTR11_EL0, x1
+
+mrs x3, SPMEVCNTR12_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR12_EL0
+// CHECK-ENCODING: encoding: [0x83,0xe1,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e183      mrs x3, SPMEVCNTR12_EL0
+
+msr SPMEVCNTR12_EL0, x1
+// CHECK-INST: msr SPMEVCNTR12_EL0, x1
+// CHECK-ENCODING: encoding: [0x81,0xe1,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e181      msr SPMEVCNTR12_EL0, x1
+
+mrs x3, SPMEVCNTR13_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR13_EL0
+// CHECK-ENCODING: encoding: [0xa3,0xe1,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e1a3      mrs x3, SPMEVCNTR13_EL0
+
+msr SPMEVCNTR13_EL0, x1
+// CHECK-INST: msr SPMEVCNTR13_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0xe1,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e1a1      msr SPMEVCNTR13_EL0, x1
+
+mrs x3, SPMEVCNTR14_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR14_EL0
+// CHECK-ENCODING: encoding: [0xc3,0xe1,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e1c3      mrs x3, SPMEVCNTR14_EL0
+
+msr SPMEVCNTR14_EL0, x1
+// CHECK-INST: msr SPMEVCNTR14_EL0, x1
+// CHECK-ENCODING: encoding: [0xc1,0xe1,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e1c1      msr SPMEVCNTR14_EL0, x1
+
+mrs x3, SPMEVCNTR15_EL0
+// CHECK-INST: mrs x3, SPMEVCNTR15_EL0
+// CHECK-ENCODING: encoding: [0xe3,0xe1,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e1e3      mrs x3, SPMEVCNTR15_EL0
+
+msr SPMEVCNTR15_EL0, x1
+// CHECK-INST: msr SPMEVCNTR15_EL0, x1
+// CHECK-ENCODING: encoding: [0xe1,0xe1,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e1e1      msr SPMEVCNTR15_EL0, x1
+
+mrs x3, SPMEVFILT2R0_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R0_EL0
+// CHECK-ENCODING: encoding: [0x03,0xe6,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e603      mrs x3, SPMEVFILT2R0_EL0
+
+msr SPMEVFILT2R0_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R0_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0xe6,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e601      msr SPMEVFILT2R0_EL0, x1
+
+mrs x3, SPMEVFILT2R1_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R1_EL0
+// CHECK-ENCODING: encoding: [0x23,0xe6,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e623      mrs x3, SPMEVFILT2R1_EL0
+
+msr SPMEVFILT2R1_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R1_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0xe6,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e621      msr SPMEVFILT2R1_EL0, x1
+
+mrs x3, SPMEVFILT2R2_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R2_EL0
+// CHECK-ENCODING: encoding: [0x43,0xe6,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e643      mrs x3, SPMEVFILT2R2_EL0
+
+msr SPMEVFILT2R2_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R2_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0xe6,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e641      msr SPMEVFILT2R2_EL0, x1
+
+mrs x3, SPMEVFILT2R3_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R3_EL0
+// CHECK-ENCODING: encoding: [0x63,0xe6,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e663      mrs x3, SPMEVFILT2R3_EL0
+
+msr SPMEVFILT2R3_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R3_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0xe6,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e661      msr SPMEVFILT2R3_EL0, x1
+
+mrs x3, SPMEVFILT2R4_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R4_EL0
+// CHECK-ENCODING: encoding: [0x83,0xe6,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e683      mrs x3, SPMEVFILT2R4_EL0
+
+msr SPMEVFILT2R4_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R4_EL0, x1
+// CHECK-ENCODING: encoding: [0x81,0xe6,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e681      msr SPMEVFILT2R4_EL0, x1
+
+mrs x3, SPMEVFILT2R5_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R5_EL0
+// CHECK-ENCODING: encoding: [0xa3,0xe6,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e6a3      mrs x3, SPMEVFILT2R5_EL0
+
+msr SPMEVFILT2R5_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R5_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0xe6,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e6a1      msr SPMEVFILT2R5_EL0, x1
+
+mrs x3, SPMEVFILT2R6_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R6_EL0
+// CHECK-ENCODING: encoding: [0xc3,0xe6,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e6c3      mrs x3, SPMEVFILT2R6_EL0
+
+msr SPMEVFILT2R6_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R6_EL0, x1
+// CHECK-ENCODING: encoding: [0xc1,0xe6,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e6c1      msr SPMEVFILT2R6_EL0, x1
+
+mrs x3, SPMEVFILT2R7_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R7_EL0
+// CHECK-ENCODING: encoding: [0xe3,0xe6,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e6e3      mrs x3, SPMEVFILT2R7_EL0
+
+msr SPMEVFILT2R7_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R7_EL0, x1
+// CHECK-ENCODING: encoding: [0xe1,0xe6,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e6e1      msr SPMEVFILT2R7_EL0, x1
+
+mrs x3, SPMEVFILT2R8_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R8_EL0
+// CHECK-ENCODING: encoding: [0x03,0xe7,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e703      mrs x3, SPMEVFILT2R8_EL0
+
+msr SPMEVFILT2R8_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R8_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0xe7,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e701      msr SPMEVFILT2R8_EL0, x1
+
+mrs x3, SPMEVFILT2R9_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R9_EL0
+// CHECK-ENCODING: encoding: [0x23,0xe7,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e723      mrs x3, SPMEVFILT2R9_EL0
+
+msr SPMEVFILT2R9_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R9_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0xe7,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e721      msr SPMEVFILT2R9_EL0, x1
+
+mrs x3, SPMEVFILT2R10_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R10_EL0
+// CHECK-ENCODING: encoding: [0x43,0xe7,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e743      mrs x3, SPMEVFILT2R10_EL0
+
+msr SPMEVFILT2R10_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R10_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0xe7,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e741      msr SPMEVFILT2R10_EL0, x1
+
+mrs x3, SPMEVFILT2R11_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R11_EL0
+// CHECK-ENCODING: encoding: [0x63,0xe7,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e763      mrs x3, SPMEVFILT2R11_EL0
+
+msr SPMEVFILT2R11_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R11_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0xe7,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e761      msr SPMEVFILT2R11_EL0, x1
+
+mrs x3, SPMEVFILT2R12_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R12_EL0
+// CHECK-ENCODING: encoding: [0x83,0xe7,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e783      mrs x3, SPMEVFILT2R12_EL0
+
+msr SPMEVFILT2R12_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R12_EL0, x1
+// CHECK-ENCODING: encoding: [0x81,0xe7,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e781      msr SPMEVFILT2R12_EL0, x1
+
+mrs x3, SPMEVFILT2R13_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R13_EL0
+// CHECK-ENCODING: encoding: [0xa3,0xe7,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e7a3      mrs x3, SPMEVFILT2R13_EL0
+
+msr SPMEVFILT2R13_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R13_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0xe7,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e7a1      msr SPMEVFILT2R13_EL0, x1
+
+mrs x3, SPMEVFILT2R14_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R14_EL0
+// CHECK-ENCODING: encoding: [0xc3,0xe7,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e7c3      mrs x3, SPMEVFILT2R14_EL0
+
+msr SPMEVFILT2R14_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R14_EL0, x1
+// CHECK-ENCODING: encoding: [0xc1,0xe7,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e7c1      msr SPMEVFILT2R14_EL0, x1
+
+mrs x3, SPMEVFILT2R15_EL0
+// CHECK-INST: mrs x3, SPMEVFILT2R15_EL0
+// CHECK-ENCODING: encoding: [0xe3,0xe7,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e7e3      mrs x3, SPMEVFILT2R15_EL0
+
+msr SPMEVFILT2R15_EL0, x1
+// CHECK-INST: msr SPMEVFILT2R15_EL0, x1
+// CHECK-ENCODING: encoding: [0xe1,0xe7,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e7e1      msr SPMEVFILT2R15_EL0, x1
+
+mrs x3, SPMEVFILTR0_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR0_EL0
+// CHECK-ENCODING: encoding: [0x03,0xe4,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e403      mrs x3, SPMEVFILTR0_EL0
+
+msr SPMEVFILTR0_EL0, x1
+// CHECK-INST: msr SPMEVFILTR0_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0xe4,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e401      msr SPMEVFILTR0_EL0, x1
+
+mrs x3, SPMEVFILTR1_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR1_EL0
+// CHECK-ENCODING: encoding: [0x23,0xe4,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e423      mrs x3, SPMEVFILTR1_EL0
+
+msr SPMEVFILTR1_EL0, x1
+// CHECK-INST: msr SPMEVFILTR1_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0xe4,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e421      msr SPMEVFILTR1_EL0, x1
+
+mrs x3, SPMEVFILTR2_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR2_EL0
+// CHECK-ENCODING: encoding: [0x43,0xe4,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e443      mrs x3, SPMEVFILTR2_EL0
+
+msr SPMEVFILTR2_EL0, x1
+// CHECK-INST: msr SPMEVFILTR2_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0xe4,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e441      msr SPMEVFILTR2_EL0, x1
+
+mrs x3, SPMEVFILTR3_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR3_EL0
+// CHECK-ENCODING: encoding: [0x63,0xe4,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e463      mrs x3, SPMEVFILTR3_EL0
+
+msr SPMEVFILTR3_EL0, x1
+// CHECK-INST: msr SPMEVFILTR3_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0xe4,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e461      msr SPMEVFILTR3_EL0, x1
+
+mrs x3, SPMEVFILTR4_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR4_EL0
+// CHECK-ENCODING: encoding: [0x83,0xe4,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e483      mrs x3, SPMEVFILTR4_EL0
+
+msr SPMEVFILTR4_EL0, x1
+// CHECK-INST: msr SPMEVFILTR4_EL0, x1
+// CHECK-ENCODING: encoding: [0x81,0xe4,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e481      msr SPMEVFILTR4_EL0, x1
+
+mrs x3, SPMEVFILTR5_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR5_EL0
+// CHECK-ENCODING: encoding: [0xa3,0xe4,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e4a3      mrs x3, SPMEVFILTR5_EL0
+
+msr SPMEVFILTR5_EL0, x1
+// CHECK-INST: msr SPMEVFILTR5_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0xe4,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e4a1      msr SPMEVFILTR5_EL0, x1
+
+mrs x3, SPMEVFILTR6_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR6_EL0
+// CHECK-ENCODING: encoding: [0xc3,0xe4,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e4c3      mrs x3, SPMEVFILTR6_EL0
+
+msr SPMEVFILTR6_EL0, x1
+// CHECK-INST: msr SPMEVFILTR6_EL0, x1
+// CHECK-ENCODING: encoding: [0xc1,0xe4,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e4c1      msr SPMEVFILTR6_EL0, x1
+
+mrs x3, SPMEVFILTR7_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR7_EL0
+// CHECK-ENCODING: encoding: [0xe3,0xe4,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e4e3      mrs x3, SPMEVFILTR7_EL0
+
+msr SPMEVFILTR7_EL0, x1
+// CHECK-INST: msr SPMEVFILTR7_EL0, x1
+// CHECK-ENCODING: encoding: [0xe1,0xe4,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e4e1      msr SPMEVFILTR7_EL0, x1
+
+mrs x3, SPMEVFILTR8_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR8_EL0
+// CHECK-ENCODING: encoding: [0x03,0xe5,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e503      mrs x3, SPMEVFILTR8_EL0
+
+msr SPMEVFILTR8_EL0, x1
+// CHECK-INST: msr SPMEVFILTR8_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0xe5,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e501      msr SPMEVFILTR8_EL0, x1
+
+mrs x3, SPMEVFILTR9_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR9_EL0
+// CHECK-ENCODING: encoding: [0x23,0xe5,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e523      mrs x3, SPMEVFILTR9_EL0
+
+msr SPMEVFILTR9_EL0, x1
+// CHECK-INST: msr SPMEVFILTR9_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0xe5,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e521      msr SPMEVFILTR9_EL0, x1
+
+mrs x3, SPMEVFILTR10_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR10_EL0
+// CHECK-ENCODING: encoding: [0x43,0xe5,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e543      mrs x3, SPMEVFILTR10_EL0
+
+msr SPMEVFILTR10_EL0, x1
+// CHECK-INST: msr SPMEVFILTR10_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0xe5,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e541      msr SPMEVFILTR10_EL0, x1
+
+mrs x3, SPMEVFILTR11_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR11_EL0
+// CHECK-ENCODING: encoding: [0x63,0xe5,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e563      mrs x3, SPMEVFILTR11_EL0
+
+msr SPMEVFILTR11_EL0, x1
+// CHECK-INST: msr SPMEVFILTR11_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0xe5,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e561      msr SPMEVFILTR11_EL0, x1
+
+mrs x3, SPMEVFILTR12_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR12_EL0
+// CHECK-ENCODING: encoding: [0x83,0xe5,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e583      mrs x3, SPMEVFILTR12_EL0
+
+msr SPMEVFILTR12_EL0, x1
+// CHECK-INST: msr SPMEVFILTR12_EL0, x1
+// CHECK-ENCODING: encoding: [0x81,0xe5,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e581      msr SPMEVFILTR12_EL0, x1
+
+mrs x3, SPMEVFILTR13_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR13_EL0
+// CHECK-ENCODING: encoding: [0xa3,0xe5,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e5a3      mrs x3, SPMEVFILTR13_EL0
+
+msr SPMEVFILTR13_EL0, x1
+// CHECK-INST: msr SPMEVFILTR13_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0xe5,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e5a1      msr SPMEVFILTR13_EL0, x1
+
+mrs x3, SPMEVFILTR14_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR14_EL0
+// CHECK-ENCODING: encoding: [0xc3,0xe5,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e5c3      mrs x3, SPMEVFILTR14_EL0
+
+msr SPMEVFILTR14_EL0, x1
+// CHECK-INST: msr SPMEVFILTR14_EL0, x1
+// CHECK-ENCODING: encoding: [0xc1,0xe5,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e5c1      msr SPMEVFILTR14_EL0, x1
+
+mrs x3, SPMEVFILTR15_EL0
+// CHECK-INST: mrs x3, SPMEVFILTR15_EL0
+// CHECK-ENCODING: encoding: [0xe3,0xe5,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e5e3      mrs x3, SPMEVFILTR15_EL0
+
+msr SPMEVFILTR15_EL0, x1
+// CHECK-INST: msr SPMEVFILTR15_EL0, x1
+// CHECK-ENCODING: encoding: [0xe1,0xe5,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e5e1      msr SPMEVFILTR15_EL0, x1
+
+mrs x3, SPMEVTYPER0_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER0_EL0
+// CHECK-ENCODING: encoding: [0x03,0xe2,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e203      mrs x3, SPMEVTYPER0_EL0
+
+msr SPMEVTYPER0_EL0, x1
+// CHECK-INST: msr SPMEVTYPER0_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0xe2,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e201      msr SPMEVTYPER0_EL0, x1
+
+mrs x3, SPMEVTYPER1_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER1_EL0
+// CHECK-ENCODING: encoding: [0x23,0xe2,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e223      mrs x3, SPMEVTYPER1_EL0
+
+msr SPMEVTYPER1_EL0, x1
+// CHECK-INST: msr SPMEVTYPER1_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0xe2,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e221      msr SPMEVTYPER1_EL0, x1
+
+mrs x3, SPMEVTYPER2_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER2_EL0
+// CHECK-ENCODING: encoding: [0x43,0xe2,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e243      mrs x3, SPMEVTYPER2_EL0
+
+msr SPMEVTYPER2_EL0, x1
+// CHECK-INST: msr SPMEVTYPER2_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0xe2,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e241      msr SPMEVTYPER2_EL0, x1
+
+mrs x3, SPMEVTYPER3_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER3_EL0
+// CHECK-ENCODING: encoding: [0x63,0xe2,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e263      mrs x3, SPMEVTYPER3_EL0
+
+msr SPMEVTYPER3_EL0, x1
+// CHECK-INST: msr SPMEVTYPER3_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0xe2,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e261      msr SPMEVTYPER3_EL0, x1
+
+mrs x3, SPMEVTYPER4_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER4_EL0
+// CHECK-ENCODING: encoding: [0x83,0xe2,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e283      mrs x3, SPMEVTYPER4_EL0
+
+msr SPMEVTYPER4_EL0, x1
+// CHECK-INST: msr SPMEVTYPER4_EL0, x1
+// CHECK-ENCODING: encoding: [0x81,0xe2,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e281      msr SPMEVTYPER4_EL0, x1
+
+mrs x3, SPMEVTYPER5_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER5_EL0
+// CHECK-ENCODING: encoding: [0xa3,0xe2,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e2a3      mrs x3, SPMEVTYPER5_EL0
+
+msr SPMEVTYPER5_EL0, x1
+// CHECK-INST: msr SPMEVTYPER5_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0xe2,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e2a1      msr SPMEVTYPER5_EL0, x1
+
+mrs x3, SPMEVTYPER6_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER6_EL0
+// CHECK-ENCODING: encoding: [0xc3,0xe2,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e2c3      mrs x3, SPMEVTYPER6_EL0
+
+msr SPMEVTYPER6_EL0, x1
+// CHECK-INST: msr SPMEVTYPER6_EL0, x1
+// CHECK-ENCODING: encoding: [0xc1,0xe2,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e2c1      msr SPMEVTYPER6_EL0, x1
+
+mrs x3, SPMEVTYPER7_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER7_EL0
+// CHECK-ENCODING: encoding: [0xe3,0xe2,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e2e3      mrs x3, SPMEVTYPER7_EL0
+
+msr SPMEVTYPER7_EL0, x1
+// CHECK-INST: msr SPMEVTYPER7_EL0, x1
+// CHECK-ENCODING: encoding: [0xe1,0xe2,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e2e1      msr SPMEVTYPER7_EL0, x1
+
+mrs x3, SPMEVTYPER8_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER8_EL0
+// CHECK-ENCODING: encoding: [0x03,0xe3,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e303      mrs x3, SPMEVTYPER8_EL0
+
+msr SPMEVTYPER8_EL0, x1
+// CHECK-INST: msr SPMEVTYPER8_EL0, x1
+// CHECK-ENCODING: encoding: [0x01,0xe3,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e301      msr SPMEVTYPER8_EL0, x1
+
+mrs x3, SPMEVTYPER9_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER9_EL0
+// CHECK-ENCODING: encoding: [0x23,0xe3,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e323      mrs x3, SPMEVTYPER9_EL0
+
+msr SPMEVTYPER9_EL0, x1
+// CHECK-INST: msr SPMEVTYPER9_EL0, x1
+// CHECK-ENCODING: encoding: [0x21,0xe3,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e321      msr SPMEVTYPER9_EL0, x1
+
+mrs x3, SPMEVTYPER10_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER10_EL0
+// CHECK-ENCODING: encoding: [0x43,0xe3,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e343      mrs x3, SPMEVTYPER10_EL0
+
+msr SPMEVTYPER10_EL0, x1
+// CHECK-INST: msr SPMEVTYPER10_EL0, x1
+// CHECK-ENCODING: encoding: [0x41,0xe3,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e341      msr SPMEVTYPER10_EL0, x1
+
+mrs x3, SPMEVTYPER11_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER11_EL0
+// CHECK-ENCODING: encoding: [0x63,0xe3,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e363      mrs x3, SPMEVTYPER11_EL0
+
+msr SPMEVTYPER11_EL0, x1
+// CHECK-INST: msr SPMEVTYPER11_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0xe3,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e361      msr SPMEVTYPER11_EL0, x1
+
+mrs x3, SPMEVTYPER12_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER12_EL0
+// CHECK-ENCODING: encoding: [0x83,0xe3,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e383      mrs x3, SPMEVTYPER12_EL0
+
+msr SPMEVTYPER12_EL0, x1
+// CHECK-INST: msr SPMEVTYPER12_EL0, x1
+// CHECK-ENCODING: encoding: [0x81,0xe3,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e381      msr SPMEVTYPER12_EL0, x1
+
+mrs x3, SPMEVTYPER13_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER13_EL0
+// CHECK-ENCODING: encoding: [0xa3,0xe3,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e3a3      mrs x3, SPMEVTYPER13_EL0
+
+msr SPMEVTYPER13_EL0, x1
+// CHECK-INST: msr SPMEVTYPER13_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0xe3,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e3a1      msr SPMEVTYPER13_EL0, x1
+
+mrs x3, SPMEVTYPER14_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER14_EL0
+// CHECK-ENCODING: encoding: [0xc3,0xe3,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e3c3      mrs x3, SPMEVTYPER14_EL0
+
+msr SPMEVTYPER14_EL0, x1
+// CHECK-INST: msr SPMEVTYPER14_EL0, x1
+// CHECK-ENCODING: encoding: [0xc1,0xe3,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e3c1      msr SPMEVTYPER14_EL0, x1
+
+mrs x3, SPMEVTYPER15_EL0
+// CHECK-INST: mrs x3, SPMEVTYPER15_EL0
+// CHECK-ENCODING: encoding: [0xe3,0xe3,0x33,0xd5]
+// CHECK-UNKNOWN:  d533e3e3      mrs x3, SPMEVTYPER15_EL0
+
+msr SPMEVTYPER15_EL0, x1
+// CHECK-INST: msr SPMEVTYPER15_EL0, x1
+// CHECK-ENCODING: encoding: [0xe1,0xe3,0x13,0xd5]
+// CHECK-UNKNOWN:  d513e3e1      msr SPMEVTYPER15_EL0, x1
+
+mrs x3, SPMIIDR_EL1
+// CHECK-INST: mrs x3, SPMIIDR_EL1
+// CHECK-ENCODING: encoding: [0x83,0x9d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309d83      mrs x3, SPMIIDR_EL1
+
+mrs x3, SPMINTENCLR_EL1
+// CHECK-INST: mrs x3, SPMINTENCLR_EL1
+// CHECK-ENCODING: encoding: [0x43,0x9e,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309e43      mrs x3, SPMINTENCLR_EL1
+
+msr SPMINTENCLR_EL1, x1
+// CHECK-INST: msr SPMINTENCLR_EL1, x1
+// CHECK-ENCODING: encoding: [0x41,0x9e,0x10,0xd5]
+// CHECK-UNKNOWN:  d5109e41      msr SPMINTENCLR_EL1, x1
+
+mrs x3, SPMINTENSET_EL1
+// CHECK-INST: mrs x3, SPMINTENSET_EL1
+// CHECK-ENCODING: encoding: [0x23,0x9e,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309e23      mrs x3, SPMINTENSET_EL1
+
+msr SPMINTENSET_EL1, x1
+// CHECK-INST: msr SPMINTENSET_EL1, x1
+// CHECK-ENCODING: encoding: [0x21,0x9e,0x10,0xd5]
+// CHECK-UNKNOWN:  d5109e21      msr SPMINTENSET_EL1, x1
+
+mrs x3, SPMOVSCLR_EL0
+// CHECK-INST: mrs x3, SPMOVSCLR_EL0
+// CHECK-ENCODING: encoding: [0x63,0x9c,0x33,0xd5]
+// CHECK-UNKNOWN:  d5339c63      mrs x3, SPMOVSCLR_EL0
+
+msr SPMOVSCLR_EL0, x1
+// CHECK-INST: msr SPMOVSCLR_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0x9c,0x13,0xd5]
+// CHECK-UNKNOWN:  d5139c61      msr SPMOVSCLR_EL0, x1
+
+mrs x3, SPMOVSSET_EL0
+// CHECK-INST: mrs x3, SPMOVSSET_EL0
+// CHECK-ENCODING: encoding: [0x63,0x9e,0x33,0xd5]
+// CHECK-UNKNOWN:  d5339e63      mrs x3, SPMOVSSET_EL0
+
+msr SPMOVSSET_EL0, x1
+// CHECK-INST: msr SPMOVSSET_EL0, x1
+// CHECK-ENCODING: encoding: [0x61,0x9e,0x13,0xd5]
+// CHECK-UNKNOWN:  d5139e61      msr SPMOVSSET_EL0, x1
+
+mrs x3, SPMSELR_EL0
+// CHECK-INST: mrs x3, SPMSELR_EL0
+// CHECK-ENCODING: encoding: [0xa3,0x9c,0x33,0xd5]
+// CHECK-UNKNOWN:  d5339ca3      mrs x3, SPMSELR_EL0
+
+msr SPMSELR_EL0, x1
+// CHECK-INST: msr SPMSELR_EL0, x1
+// CHECK-ENCODING: encoding: [0xa1,0x9c,0x13,0xd5]
+// CHECK-UNKNOWN:  d5139ca1      msr SPMSELR_EL0, x1
+
+mrs x3, SPMCGCR0_EL1
+// CHECK-INST: mrs x3, SPMCGCR0_EL1
+// CHECK-ENCODING: encoding: [0x03,0x9d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309d03      mrs x3, SPMCGCR0_EL1
+
+mrs x3, SPMCGCR1_EL1
+// CHECK-INST: mrs x3, SPMCGCR1_EL1
+// CHECK-ENCODING: encoding: [0x23,0x9d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309d23      mrs x3, SPMCGCR1_EL1
+
+mrs x3, SPMCFGR_EL1
+// CHECK-INST: mrs x3, SPMCFGR_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x9d,0x30,0xd5]
+// CHECK-UNKNOWN:  d5309de3      mrs x3, SPMCFGR_EL1
+
+mrs x3, SPMROOTCR_EL3
+// CHECK-INST: mrs x3, SPMROOTCR_EL3
+// CHECK-ENCODING: encoding: [0xe3,0x9e,0x36,0xd5]
+// CHECK-UNKNOWN:  d5369ee3      mrs x3, SPMROOTCR_EL3
+
+msr SPMROOTCR_EL3, x3
+// CHECK-INST: msr SPMROOTCR_EL3, x3
+// CHECK-ENCODING: encoding: [0xe3,0x9e,0x16,0xd5]
+// CHECK-UNKNOWN:  d5169ee3      msr SPMROOTCR_EL3, x3
+
+mrs x3, SPMSCR_EL1
+// CHECK-INST: mrs x3, SPMSCR_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x9e,0x37,0xd5]
+// CHECK-UNKNOWN:  d5379ee3      mrs x3, SPMSCR_EL1
+
+msr SPMSCR_EL1, x3
+// CHECK-INST: msr SPMSCR_EL1, x3
+// CHECK-ENCODING: encoding: [0xe3,0x9e,0x17,0xd5]
+// CHECK-UNKNOWN:  d5179ee3      msr SPMSCR_EL1, x3
 
 // FEAT_ITE
-            mrs x3, TRCITEEDCR
-// CHECK:   mrs x3, TRCITEEDCR                  // encoding: [0x23,0x02,0x31,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:21: error: expected readable system register
-            msr TRCITEEDCR, x3
-// CHECK:   msr TRCITEEDCR, x3                  // encoding: [0x23,0x02,0x11,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:17: error: expected writable system register
-            mrs	x3, TRCITECR_EL1
-// CHECK:   mrs	x3, TRCITECR_EL1                // encoding: [0x63,0x12,0x38,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:21: error: expected readable system register
-            msr	TRCITECR_EL1, x1
-// CHECK:   msr	TRCITECR_EL1, x1                // encoding: [0x61,0x12,0x18,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:17: error: expected writable system register or pstate
-            mrs	x3, TRCITECR_EL12
-// CHECK:   mrs	x3, TRCITECR_EL12               // encoding: [0x63,0x12,0x3d,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:21: error: expected readable system register
-            msr	TRCITECR_EL12, x1
-// CHECK:   msr	TRCITECR_EL12, x1               // encoding: [0x61,0x12,0x1d,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:17: error: expected writable system register or pstate
-            mrs	x3, TRCITECR_EL2
-// CHECK:   mrs	x3, TRCITECR_EL2                // encoding: [0x63,0x12,0x3c,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:21: error: expected readable system register
-            msr	TRCITECR_EL2, x1
-// CHECK:   msr	TRCITECR_EL2, x1                // encoding: [0x61,0x12,0x1c,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:17: error: expected writable system register or pstate
-            trcit x1
-// CHECK:   trcit x1                            // encoding: [0xe1,0x72,0x0b,0xd5]
-// ERROR-NO-ITE: [[@LINE-2]]:13: error: instruction requires: ite
+mrs x3, TRCITEEDCR
+// CHECK-INST: mrs x3, TRCITEEDCR
+// CHECK-ENCODING: encoding: [0x23,0x02,0x31,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d5310223      mrs x3, S2_1_C0_C2_1
+
+msr TRCITEEDCR, x3
+// CHECK-INST: msr TRCITEEDCR, x3
+// CHECK-ENCODING: encoding: [0x23,0x02,0x11,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d5110223      msr S2_1_C0_C2_1, x3
+
+mrs x3, TRCITECR_EL1
+// CHECK-INST: mrs x3, TRCITECR_EL1
+// CHECK-ENCODING: encoding: [0x63,0x12,0x38,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d5381263      mrs x3, S3_0_C1_C2_3
+
+msr TRCITECR_EL1, x1
+// CHECK-INST: msr TRCITECR_EL1, x1
+// CHECK-ENCODING: encoding: [0x61,0x12,0x18,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d5181261      msr S3_0_C1_C2_3, x1
+
+mrs x3, TRCITECR_EL12
+// CHECK-INST: mrs x3, TRCITECR_EL12
+// CHECK-ENCODING: encoding: [0x63,0x12,0x3d,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53d1263      mrs x3, S3_5_C1_C2_3
+
+msr TRCITECR_EL12, x1
+// CHECK-INST: msr TRCITECR_EL12, x1
+// CHECK-ENCODING: encoding: [0x61,0x12,0x1d,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51d1261      msr S3_5_C1_C2_3, x1
+
+mrs x3, TRCITECR_EL2
+// CHECK-INST: mrs x3, TRCITECR_EL2
+// CHECK-ENCODING: encoding: [0x63,0x12,0x3c,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53c1263      mrs x3, S3_4_C1_C2_3
+
+msr TRCITECR_EL2, x1
+// CHECK-INST: msr TRCITECR_EL2, x1
+// CHECK-ENCODING: encoding: [0x61,0x12,0x1c,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51c1261      msr S3_4_C1_C2_3, x1
+
+trcit x1
+// CHECK-INST: trcit x1
+// CHECK-ENCODING: encoding: [0xe1,0x72,0x0b,0xd5]
+// CHECK-ERROR: error: instruction requires: ite
+// CHECK-UNKNOWN:  d50b72e1      sys #3, c7, c2, #7, x1
 
 // FEAT_SPE_FDS
-            mrs x3, PMSDSFR_EL1
-// CHECK:   mrs x3, PMSDSFR_EL1                 // encoding: [0x83,0x9a,0x38,0xd5]
-            msr PMSDSFR_EL1, x3
-// CHECK:   msr PMSDSFR_EL1, x3                 // encoding: [0x83,0x9a,0x18,0xd5]
+mrs x3, PMSDSFR_EL1
+// CHECK-INST: mrs x3, PMSDSFR_EL1
+// CHECK-ENCODING: encoding: [0x83,0x9a,0x38,0xd5]
+// CHECK-UNKNOWN:  d5389a83      mrs x3, PMSDSFR_EL1
+
+msr PMSDSFR_EL1, x3
+// CHECK-INST: msr PMSDSFR_EL1, x3
+// CHECK-ENCODING: encoding: [0x83,0x9a,0x18,0xd5]
+// CHECK-UNKNOWN:  d5189a83      msr PMSDSFR_EL1, x3
diff --git a/llvm/test/MC/AArch64/armv8.9a-lrcpc3.s b/llvm/test/MC/AArch64/armv8.9a-lrcpc3.s
index 263f200..4ccc800 100644
--- a/llvm/test/MC/AArch64/armv8.9a-lrcpc3.s
+++ b/llvm/test/MC/AArch64/armv8.9a-lrcpc3.s
@@ -1,143 +1,282 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding               -mattr=+rcpc3 < %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.9a -mattr=+rcpc3 < %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v9.4a -mattr=+rcpc3 < %s | FileCheck %s
-
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu               < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-RCPC3 %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.9a < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-RCPC3 %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v9.4a < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-RCPC3 %s
-
-               stilp   w24, w0, [x16, #-8]!
-// CHECK:      stilp   w24, w0, [x16, #-8]!     // encoding: [0x18,0x0a,0x00,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stilp   w24, w0, [x16,  -8]!
-// CHECK:      stilp   w24, w0, [x16, #-8]!     // encoding: [0x18,0x0a,0x00,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stilp   x25, x1, [x17,  -16]!
-// CHECK:      stilp   x25, x1, [x17, #-16]!    // encoding: [0x39,0x0a,0x01,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stilp   x25, x1, [x17, #-16]!
-// CHECK:      stilp   x25, x1, [x17, #-16]!    // encoding: [0x39,0x0a,0x01,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stilp   w26, w2, [x18]
-// CHECK:      stilp   w26, w2, [x18]           // encoding: [0x5a,0x1a,0x02,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stilp   w26, w2, [x18, #0]
-// CHECK:      stilp   w26, w2, [x18]           // encoding: [0x5a,0x1a,0x02,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stilp   x27, x3, [sp]
-// CHECK:      stilp   x27, x3, [sp]            // encoding: [0xfb,0x1b,0x03,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stilp   x27, x3, [sp, 0]
-// CHECK:      stilp   x27, x3, [sp]            // encoding: [0xfb,0x1b,0x03,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  w28, w4, [x20], #8
-// CHECK:      ldiapp  w28, w4, [x20], #8       // encoding: [0x9c,0x0a,0x44,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  w28, w4, [x20, #0], #8
-// CHECK:      ldiapp  w28, w4, [x20], #8       // encoding: [0x9c,0x0a,0x44,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  w28, w4, [x20],  8
-// CHECK:      ldiapp  w28, w4, [x20], #8       // encoding: [0x9c,0x0a,0x44,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  w28, w4, [x20, 0],  8
-// CHECK:      ldiapp  w28, w4, [x20], #8       // encoding: [0x9c,0x0a,0x44,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  x29, x5, [x21], #16
-// CHECK:      ldiapp  x29, x5, [x21], #16      // encoding: [0xbd,0x0a,0x45,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  x29, x5, [x21],  16
-// CHECK:      ldiapp  x29, x5, [x21], #16      // encoding: [0xbd,0x0a,0x45,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  w30, w6, [sp]
-// CHECK:      ldiapp  w30, w6, [sp]            // encoding: [0xfe,0x1b,0x46,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  w30, w6, [sp, #0]
-// CHECK:      ldiapp  w30, w6, [sp]            // encoding: [0xfe,0x1b,0x46,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  xzr, x7, [x23]
-// CHECK:      ldiapp  xzr, x7, [x23]           // encoding: [0xff,0x1a,0x47,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldiapp  xzr, x7, [x23, 0]
-// CHECK:      ldiapp  xzr, x7, [x23]           // encoding: [0xff,0x1a,0x47,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-
-               stlr w3, [x15, #-4]!
-// CHECK:      stlr w3, [x15, #-4]!    // encoding: [0xe3,0x09,0x80,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stlr w3, [x15,  -4]!
-// CHECK:      stlr w3, [x15, #-4]!    // encoding: [0xe3,0x09,0x80,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stlr x3, [x15, #-8]!
-// CHECK:      stlr x3, [x15, #-8]!    // encoding: [0xe3,0x09,0x80,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stlr x3, [sp,  -8]!
-// CHECK:      stlr x3, [sp, #-8]!     // encoding: [0xe3,0x0b,0x80,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapr w3, [sp], #4
-// CHECK:      ldapr w3, [sp], #4       // encoding: [0xe3,0x0b,0xc0,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapr w3, [x15], 4
-// CHECK:      ldapr w3, [x15], #4      // encoding: [0xe3,0x09,0xc0,0x99]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapr x3, [x15], #8
-// CHECK:      ldapr x3, [x15], #8      // encoding: [0xe3,0x09,0xc0,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapr x3, [x15], 8
-// CHECK:      ldapr x3, [x15], #8      // encoding: [0xe3,0x09,0xc0,0xd9]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-
-               stlur b3, [x15, #-1]
-// CHECK:      stlur b3, [x15, #-1]  // encoding: [0xe3,0xf9,0x1f,0x1d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stlur h3, [x15, #2]
-// CHECK:      stlur h3, [x15, #2]   // encoding: [0xe3,0x29,0x00,0x5d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stlur s3, [x15, #-3]
-// CHECK:      stlur s3, [x15, #-3]  // encoding: [0xe3,0xd9,0x1f,0x9d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stlur d3, [sp, #4]
-// CHECK:      stlur d3, [sp, #4]    // encoding: [0xe3,0x4b,0x00,0xdd]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stlur q3, [x15, #-5]
-// CHECK:      stlur q3, [x15, #-5]  // encoding: [0xe3,0xb9,0x9f,0x1d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapur b3, [x15, #6]
-// CHECK:      ldapur b3, [x15, #6]  // encoding: [0xe3,0x69,0x40,0x1d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapur h3, [x15, #-7]
-// CHECK:      ldapur h3, [x15, #-7] // encoding: [0xe3,0x99,0x5f,0x5d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapur s3, [x15, #8]
-// CHECK:      ldapur s3, [x15, #8]  // encoding: [0xe3,0x89,0x40,0x9d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapur d3, [x15, #-9]
-// CHECK:      ldapur d3, [x15, #-9] // encoding: [0xe3,0x79,0x5f,0xdd]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldapur q3, [sp, #10]
-// CHECK:      ldapur q3, [sp, #10]  // encoding: [0xe3,0xab,0xc0,0x1d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-
-               stl1  { v3.d }[0], [x15]
-// CHECK:      stl1  { v3.d }[0], [x15]     // encoding: [0xe3,0x85,0x01,0x0d]
-// ERROR-NO-RCPC3:  [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stl1  { v3.d }[0], [x15, #0]
-// CHECK:      stl1  { v3.d }[0], [x15]     // encoding: [0xe3,0x85,0x01,0x0d]
-// ERROR-NO-RCPC3:  [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stl1  { v3.d }[1], [sp]
-// CHECK:      stl1  { v3.d }[1], [sp]      // encoding: [0xe3,0x87,0x01,0x4d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               stl1  { v3.d }[1], [sp, 0]
-// CHECK:      stl1  { v3.d }[1], [sp]      // encoding: [0xe3,0x87,0x01,0x4d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldap1 { v3.d }[0], [sp]
-// CHECK:      ldap1 { v3.d }[0], [sp]      // encoding: [0xe3,0x87,0x41,0x0d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldap1 { v3.d }[0], [sp, #0]
-// CHECK:      ldap1 { v3.d }[0], [sp]      // encoding: [0xe3,0x87,0x41,0x0d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldap1 { v3.d }[1], [x15]
-// CHECK:      ldap1 { v3.d }[1], [x15]     // encoding: [0xe3,0x85,0x41,0x4d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
-               ldap1 { v3.d }[1], [x15, 0]
-// CHECK:      ldap1 { v3.d }[1], [x15]     // encoding: [0xe3,0x85,0x41,0x4d]
-// ERROR-NO-RCPC3: [[@LINE-2]]:16: error: instruction requires: rcpc3
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+rcpc3 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+rcpc3,+v8.9a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+rcpc3,+v9.4a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+rcpc3 < %s \
+// RUN:        | llvm-objdump -d --mattr=+rcpc3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+rcpc3 < %s \
+// RUN:   | llvm-objdump -d --mattr=-rcpc3 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+rcpc3 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+rcpc3 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
+stilp   w24, w0, [x16, #-8]!
+// CHECK-INST: stilp w24, w0, [x16, #-8]!
+// CHECK-ENCODING: encoding: [0x18,0x0a,0x00,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99000a18      <unknown>
+
+stilp   w24, w0, [x16,  -8]!
+// CHECK-INST: stilp w24, w0, [x16, #-8]!
+// CHECK-ENCODING: encoding: [0x18,0x0a,0x00,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99000a18      <unknown>
+
+stilp   x25, x1, [x17,  -16]!
+// CHECK-INST: stilp x25, x1, [x17, #-16]!
+// CHECK-ENCODING: encoding: [0x39,0x0a,0x01,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9010a39      <unknown>
+
+stilp   x25, x1, [x17, #-16]!
+// CHECK-INST: stilp x25, x1, [x17, #-16]!
+// CHECK-ENCODING: encoding: [0x39,0x0a,0x01,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9010a39      <unknown>
+
+stilp   w26, w2, [x18]
+// CHECK-INST: stilp w26, w2, [x18]
+// CHECK-ENCODING: encoding: [0x5a,0x1a,0x02,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99021a5a      <unknown>
+
+stilp   w26, w2, [x18, #0]
+// CHECK-INST: stilp w26, w2, [x18]
+// CHECK-ENCODING: encoding: [0x5a,0x1a,0x02,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99021a5a      <unknown>
+
+stilp   x27, x3, [sp]
+// CHECK-INST: stilp x27, x3, [sp]
+// CHECK-ENCODING: encoding: [0xfb,0x1b,0x03,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9031bfb      <unknown>
+
+stilp   x27, x3, [sp, 0]
+// CHECK-INST: stilp x27, x3, [sp]
+// CHECK-ENCODING: encoding: [0xfb,0x1b,0x03,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9031bfb      <unknown>
+
+ldiapp  w28, w4, [x20], #8
+// CHECK-INST: ldiapp w28, w4, [x20], #8
+// CHECK-ENCODING: encoding: [0x9c,0x0a,0x44,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99440a9c      <unknown>
+
+ldiapp  w28, w4, [x20, #0], #8
+// CHECK-INST: ldiapp w28, w4, [x20], #8
+// CHECK-ENCODING: encoding: [0x9c,0x0a,0x44,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99440a9c      <unknown>
+
+ldiapp  w28, w4, [x20],  8
+// CHECK-INST: ldiapp w28, w4, [x20], #8
+// CHECK-ENCODING: encoding: [0x9c,0x0a,0x44,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99440a9c      <unknown>
+
+ldiapp  w28, w4, [x20, 0],  8
+// CHECK-INST: ldiapp w28, w4, [x20], #8
+// CHECK-ENCODING: encoding: [0x9c,0x0a,0x44,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99440a9c      <unknown>
+
+ldiapp  x29, x5, [x21], #16
+// CHECK-INST: ldiapp x29, x5, [x21], #16
+// CHECK-ENCODING: encoding: [0xbd,0x0a,0x45,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9450abd      <unknown>
+
+ldiapp  x29, x5, [x21],  16
+// CHECK-INST: ldiapp x29, x5, [x21], #16
+// CHECK-ENCODING: encoding: [0xbd,0x0a,0x45,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9450abd      <unknown>
+
+ldiapp  w30, w6, [sp]
+// CHECK-INST: ldiapp w30, w6, [sp]
+// CHECK-ENCODING: encoding: [0xfe,0x1b,0x46,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99461bfe      <unknown>
+
+ldiapp  w30, w6, [sp, #0]
+// CHECK-INST: ldiapp w30, w6, [sp]
+// CHECK-ENCODING: encoding: [0xfe,0x1b,0x46,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99461bfe      <unknown>
+
+ldiapp  xzr, x7, [x23]
+// CHECK-INST: ldiapp xzr, x7, [x23]
+// CHECK-ENCODING: encoding: [0xff,0x1a,0x47,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9471aff      <unknown>
+
+ldiapp  xzr, x7, [x23, 0]
+// CHECK-INST: ldiapp xzr, x7, [x23]
+// CHECK-ENCODING: encoding: [0xff,0x1a,0x47,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9471aff      <unknown>
+
+stlr w3, [x15, #-4]!
+// CHECK-INST: stlr w3, [x15, #-4]!
+// CHECK-ENCODING: encoding: [0xe3,0x09,0x80,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  998009e3      <unknown>
+
+stlr w3, [x15,  -4]!
+// CHECK-INST: stlr w3, [x15, #-4]!
+// CHECK-ENCODING: encoding: [0xe3,0x09,0x80,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  998009e3      <unknown>
+
+stlr x3, [x15, #-8]!
+// CHECK-INST: stlr x3, [x15, #-8]!
+// CHECK-ENCODING: encoding: [0xe3,0x09,0x80,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d98009e3      <unknown>
+
+stlr x3, [sp,  -8]!
+// CHECK-INST: stlr x3, [sp, #-8]!
+// CHECK-ENCODING: encoding: [0xe3,0x0b,0x80,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9800be3      <unknown>
+
+ldapr w3, [sp], #4
+// CHECK-INST: ldapr w3, [sp], #4
+// CHECK-ENCODING: encoding: [0xe3,0x0b,0xc0,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99c00be3      <unknown>
+
+ldapr w3, [x15], 4
+// CHECK-INST: ldapr w3, [x15], #4
+// CHECK-ENCODING: encoding: [0xe3,0x09,0xc0,0x99]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  99c009e3      <unknown>
+
+ldapr x3, [x15], #8
+// CHECK-INST: ldapr x3, [x15], #8
+// CHECK-ENCODING: encoding: [0xe3,0x09,0xc0,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9c009e3      <unknown>
+
+ldapr x3, [x15], 8
+// CHECK-INST: ldapr x3, [x15], #8
+// CHECK-ENCODING: encoding: [0xe3,0x09,0xc0,0xd9]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  d9c009e3      <unknown>
+
+stlur b3, [x15, #-1]
+// CHECK-INST: stlur b3, [x15, #-1]
+// CHECK-ENCODING: encoding: [0xe3,0xf9,0x1f,0x1d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  1d1ff9e3      <unknown>
+
+stlur h3, [x15, #2]
+// CHECK-INST: stlur h3, [x15, #2]
+// CHECK-ENCODING: encoding: [0xe3,0x29,0x00,0x5d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  5d0029e3      <unknown>
+
+stlur s3, [x15, #-3]
+// CHECK-INST: stlur s3, [x15, #-3]
+// CHECK-ENCODING: encoding: [0xe3,0xd9,0x1f,0x9d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  9d1fd9e3      <unknown>
+
+stlur d3, [sp, #4]
+// CHECK-INST: stlur d3, [sp, #4]
+// CHECK-ENCODING: encoding: [0xe3,0x4b,0x00,0xdd]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  dd004be3      <unknown>
+
+stlur q3, [x15, #-5]
+// CHECK-INST: stlur q3, [x15, #-5]
+// CHECK-ENCODING: encoding: [0xe3,0xb9,0x9f,0x1d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  1d9fb9e3      <unknown>
+
+ldapur b3, [x15, #6]
+// CHECK-INST: ldapur b3, [x15, #6]
+// CHECK-ENCODING: encoding: [0xe3,0x69,0x40,0x1d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  1d4069e3      <unknown>
+
+ldapur h3, [x15, #-7]
+// CHECK-INST: ldapur h3, [x15, #-7]
+// CHECK-ENCODING: encoding: [0xe3,0x99,0x5f,0x5d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  5d5f99e3      <unknown>
+
+ldapur s3, [x15, #8]
+// CHECK-INST: ldapur s3, [x15, #8]
+// CHECK-ENCODING: encoding: [0xe3,0x89,0x40,0x9d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  9d4089e3      <unknown>
+
+ldapur d3, [x15, #-9]
+// CHECK-INST: ldapur d3, [x15, #-9]
+// CHECK-ENCODING: encoding: [0xe3,0x79,0x5f,0xdd]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  dd5f79e3      <unknown>
+
+ldapur q3, [sp, #10]
+// CHECK-INST: ldapur q3, [sp, #10]
+// CHECK-ENCODING: encoding: [0xe3,0xab,0xc0,0x1d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  1dc0abe3      <unknown>
+
+stl1  { v3.d }[0], [x15]
+// CHECK-INST: stl1 { v3.d }[0], [x15]
+// CHECK-ENCODING: encoding: [0xe3,0x85,0x01,0x0d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  0d0185e3      <unknown>
+
+stl1  { v3.d }[0], [x15, #0]
+// CHECK-INST: stl1 { v3.d }[0], [x15]
+// CHECK-ENCODING: encoding: [0xe3,0x85,0x01,0x0d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  0d0185e3      <unknown>
+
+stl1  { v3.d }[1], [sp]
+// CHECK-INST: stl1 { v3.d }[1], [sp]
+// CHECK-ENCODING: encoding: [0xe3,0x87,0x01,0x4d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  4d0187e3      <unknown>
+
+stl1  { v3.d }[1], [sp, 0]
+// CHECK-INST: stl1 { v3.d }[1], [sp]
+// CHECK-ENCODING: encoding: [0xe3,0x87,0x01,0x4d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  4d0187e3      <unknown>
+
+ldap1 { v3.d }[0], [sp]
+// CHECK-INST: ldap1 { v3.d }[0], [sp]
+// CHECK-ENCODING: encoding: [0xe3,0x87,0x41,0x0d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  0d4187e3      <unknown>
+
+ldap1 { v3.d }[0], [sp, #0]
+// CHECK-INST: ldap1 { v3.d }[0], [sp]
+// CHECK-ENCODING: encoding: [0xe3,0x87,0x41,0x0d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  0d4187e3      <unknown>
+
+ldap1 { v3.d }[1], [x15]
+// CHECK-INST: ldap1 { v3.d }[1], [x15]
+// CHECK-ENCODING: encoding: [0xe3,0x85,0x41,0x4d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  4d4185e3      <unknown>
+
+ldap1 { v3.d }[1], [x15, 0]
+// CHECK-INST: ldap1 { v3.d }[1], [x15]
+// CHECK-ENCODING: encoding: [0xe3,0x85,0x41,0x4d]
+// CHECK-ERROR:error: instruction requires: rcpc3
+// CHECK-UNKNOWN:  4d4185e3      <unknown>
diff --git a/llvm/test/MC/AArch64/armv8.9a-specres2.s b/llvm/test/MC/AArch64/armv8.9a-specres2.s
index b411ec3..b79124d 100644
--- a/llvm/test/MC/AArch64/armv8.9a-specres2.s
+++ b/llvm/test/MC/AArch64/armv8.9a-specres2.s
@@ -1,13 +1,32 @@
-// RUN:     llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+specres2 < %s      | FileCheck %s
-// RUN:     llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.9a    < %s      | FileCheck %s
-// RUN:     llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v9.4a    < %s      | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=-specres2 < %s 2>&1 | FileCheck %s --check-prefix=NOSPECRES2
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v8.9a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v9.4a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+specres2 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+specres2 < %s \
+// RUN:        | llvm-objdump -d --mattr=+specres2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+specres2 < %s \
+// RUN:   | llvm-objdump -d --mattr=-specres2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble the encoding and check that the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+specres2 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+specres2 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 cosp rctx, x0
+// CHECK-INST: cosp rctx, x0
+// CHECK-ENCODING: encoding: [0xc0,0x73,0x0b,0xd5]
+// CHECK-ERROR: error: COSP requires: predres2
+// CHECK-UNKNOWN:  d50b73c0      sys #3, c7, c3, #6, x0
+
 sys #3, c7, c3, #6, x0
+// CHECK-INST: cosp rctx, x0
+// CHECK-ENCODING: encoding: [0xc0,0x73,0x0b,0xd5]
+// CHECK-UNKNOWN:  d50b73c0      sys #3, c7, c3, #6, x0
 
-// CHECK: cosp rctx, x0          // encoding: [0xc0,0x73,0x0b,0xd5]
-// CHECK: cosp rctx, x0          // encoding: [0xc0,0x73,0x0b,0xd5]
 
-// NOSPECRES2: COSP requires: predres2
-// NOSPECRES2-NEXT: cosp
diff --git a/llvm/test/MC/AArch64/armv8.9a-the-diagnostics.s b/llvm/test/MC/AArch64/armv8.9a-the-diagnostics.s
new file mode 100644
index 0000000..0fccbe1
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv8.9a-the-diagnostics.s
@@ -0,0 +1,106 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+the -mattr=+d128 < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-ZXR %s
+
+// Each instruction below names xzr as one of its two data registers and is
+// expected to be rejected; every diagnostic is pinned to its own source line.
+
+rcwswpp   xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwswppa  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwswppal xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwswppl  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwswpp   x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwswppa  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwswppal x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwswppl  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+rcwclrp   xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwclrpa  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwclrpal xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwclrpl  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwclrp   x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwclrpa  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwclrpal x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwclrpl  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+rcwsetp   xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsetpa  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsetpal xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsetpl  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsetp   x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsetpa  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsetpal x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsetpl  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+rcwsswpp   xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsswppa  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsswppal xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsswppl  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsswpp   x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsswppa  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsswppal x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsswppl  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+rcwsclrp   xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsclrpa  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsclrpal xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsclrpl  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsclrp   x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsclrpa  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsclrpal x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwsclrpl  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+rcwssetp   xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwssetpa  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwssetpal xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwssetpl  xzr, x5, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwssetp   x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwssetpa  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwssetpal x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+rcwssetpl  x5, xzr, [x4]
+// ERROR-NO-ZXR: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AArch64/armv8.9a-the.s b/llvm/test/MC/AArch64/armv8.9a-the.s
index 33e1b5d..689b6c9 100644
--- a/llvm/test/MC/AArch64/armv8.9a-the.s
+++ b/llvm/test/MC/AArch64/armv8.9a-the.s
@@ -1,592 +1,843 @@
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding               -mattr=+the -mattr=+d128 < %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.9a -mattr=+the -mattr=+d128 < %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v9.4a -mattr=+the -mattr=+d128 < %s | FileCheck %s
-
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu                           < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-THE %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.9a             < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-THE %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v9.4a             < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-THE %s
-
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu               -mattr=+the < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-D128 %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.9a -mattr=+the < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-D128 %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v9.4a -mattr=+the < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-D128 %s
-
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+the -mattr=+d128 < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-ZXR %s
-
-            mrs x3, RCWMASK_EL1
-// CHECK:   mrs x3, RCWMASK_EL1   // encoding: [0xc3,0xd0,0x38,0xd5]
-// ERROR-NO-THE: [[@LINE-2]]:21: error: expected readable system register
-            msr RCWMASK_EL1, x1
-// CHECK:   msr RCWMASK_EL1, x1   // encoding: [0xc1,0xd0,0x18,0xd5]
-// ERROR-NO-THE: [[@LINE-2]]:17: error: expected writable system register or pstate
-            mrs x3, RCWSMASK_EL1
-// CHECK:   mrs x3, RCWSMASK_EL1  // encoding: [0x63,0xd0,0x38,0xd5]
-// ERROR-NO-THE: [[@LINE-2]]:21: error: expected readable system register
-            msr RCWSMASK_EL1, x1
-// CHECK:   msr RCWSMASK_EL1, x1  // encoding: [0x61,0xd0,0x18,0xd5]
-// ERROR-NO-THE: [[@LINE-2]]:17: error: expected writable system register or pstate
-
-            rcwcas   x0, x1, [x4]
-// CHECK:   rcwcas   x0, x1, [x4] // encoding: [0x81,0x08,0x20,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwcasa  x0, x1, [x4]
-// CHECK:   rcwcasa  x0, x1, [x4] // encoding: [0x81,0x08,0xa0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwcasal x0, x1, [x4]
-// CHECK:   rcwcasal x0, x1, [x4] // encoding: [0x81,0x08,0xe0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwcasl  x0, x1, [x4]
-// CHECK:   rcwcasl  x0, x1, [x4] // encoding: [0x81,0x08,0x60,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwcas   x3, x5, [sp]
-// CHECK:   rcwcas   x3, x5, [sp] // encoding: [0xe5,0x0b,0x23,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwcasa  x3, x5, [sp]
-// CHECK:   rcwcasa  x3, x5, [sp] // encoding: [0xe5,0x0b,0xa3,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwcasal x3, x5, [sp]
-// CHECK:   rcwcasal x3, x5, [sp] // encoding: [0xe5,0x0b,0xe3,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwcasl  x3, x5, [sp]
-// CHECK:   rcwcasl  x3, x5, [sp] // encoding: [0xe5,0x0b,0x63,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-
-            rcwscas   x0, x1, [x4]
-// CHECK:   rcwscas   x0, x1, [x4] // encoding: [0x81,0x08,0x20,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwscasa  x0, x1, [x4]
-// CHECK:   rcwscasa  x0, x1, [x4] // encoding: [0x81,0x08,0xa0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwscasal x0, x1, [x4]
-// CHECK:   rcwscasal x0, x1, [x4] // encoding: [0x81,0x08,0xe0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwscasl  x0, x1, [x4]
-// CHECK:   rcwscasl  x0, x1, [x4] // encoding: [0x81,0x08,0x60,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwscas   x3, x5, [sp]
-// CHECK:   rcwscas   x3, x5, [sp] // encoding: [0xe5,0x0b,0x23,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwscasa  x3, x5, [sp]
-// CHECK:   rcwscasa  x3, x5, [sp] // encoding: [0xe5,0x0b,0xa3,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwscasal x3, x5, [sp]
-// CHECK:   rcwscasal x3, x5, [sp] // encoding: [0xe5,0x0b,0xe3,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwscasl  x3, x5, [sp]
-// CHECK:   rcwscasl  x3, x5, [sp] // encoding: [0xe5,0x0b,0x63,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-
-            rcwcasp   x0, x1, x6, x7, [x4]
-// CHECK:   rcwcasp   x0, x1, x6, x7, [x4] // encoding: [0x86,0x0c,0x20,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwcaspa  x0, x1, x6, x7, [x4]
-// CHECK:   rcwcaspa  x0, x1, x6, x7, [x4] // encoding: [0x86,0x0c,0xa0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwcaspal x0, x1, x6, x7, [x4]
-// CHECK:   rcwcaspal x0, x1, x6, x7, [x4] // encoding: [0x86,0x0c,0xe0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwcaspl  x0, x1, x6, x7, [x4]
-// CHECK:   rcwcaspl  x0, x1, x6, x7, [x4] // encoding: [0x86,0x0c,0x60,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwcasp   x4, x5, x6, x7, [sp]
-// CHECK:   rcwcasp   x4, x5, x6, x7, [sp] // encoding: [0xe6,0x0f,0x24,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwcaspa  x4, x5, x6, x7, [sp]
-// CHECK:   rcwcaspa  x4, x5, x6, x7, [sp] // encoding: [0xe6,0x0f,0xa4,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwcaspal x4, x5, x6, x7, [sp]
-// CHECK:   rcwcaspal x4, x5, x6, x7, [sp] // encoding: [0xe6,0x0f,0xe4,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwcaspl  x4, x5, x6, x7, [sp]
-// CHECK:   rcwcaspl  x4, x5, x6, x7, [sp] // encoding: [0xe6,0x0f,0x64,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-
-            rcwscasp   x0, x1, x6, x7, [x4]
-// CHECK:   rcwscasp   x0, x1, x6, x7, [x4] // encoding: [0x86,0x0c,0x20,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwscaspa  x0, x1, x6, x7, [x4]
-// CHECK:   rcwscaspa  x0, x1, x6, x7, [x4] // encoding: [0x86,0x0c,0xa0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwscaspal x0, x1, x6, x7, [x4]
-// CHECK:   rcwscaspal x0, x1, x6, x7, [x4] // encoding: [0x86,0x0c,0xe0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwscaspl  x0, x1, x6, x7, [x4]
-// CHECK:   rcwscaspl  x0, x1, x6, x7, [x4] // encoding: [0x86,0x0c,0x60,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwscasp   x4, x5, x6, x7, [sp]
-// CHECK:   rcwscasp   x4, x5, x6, x7, [sp] // encoding: [0xe6,0x0f,0x24,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwscaspa  x4, x5, x6, x7, [sp]
-// CHECK:   rcwscaspa  x4, x5, x6, x7, [sp] // encoding: [0xe6,0x0f,0xa4,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwscaspal x4, x5, x6, x7, [sp]
-// CHECK:   rcwscaspal x4, x5, x6, x7, [sp] // encoding: [0xe6,0x0f,0xe4,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwscaspl  x4, x5, x6, x7, [sp]
-// CHECK:   rcwscaspl  x4, x5, x6, x7, [sp] // encoding: [0xe6,0x0f,0x64,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-
-            rcwclr   x0, x1, [x4]
-// CHECK:   rcwclr   x0, x1, [x4] // encoding: [0x81,0x90,0x20,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwclra  x0, x1, [x4]
-// CHECK:   rcwclra  x0, x1, [x4] // encoding: [0x81,0x90,0xa0,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwclral x0, x1, [x4]
-// CHECK:   rcwclral x0, x1, [x4] // encoding: [0x81,0x90,0xe0,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwclrl  x0, x1, [x4]
-// CHECK:   rcwclrl  x0, x1, [x4] // encoding: [0x81,0x90,0x60,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwclr   x3, x5, [sp]
-// CHECK:   rcwclr   x3, x5, [sp] // encoding: [0xe5,0x93,0x23,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwclra  x3, x5, [sp]
-// CHECK:   rcwclra  x3, x5, [sp] // encoding: [0xe5,0x93,0xa3,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwclral x3, x5, [sp]
-// CHECK:   rcwclral x3, x5, [sp] // encoding: [0xe5,0x93,0xe3,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwclrl  x3, x5, [sp]
-// CHECK:   rcwclrl  x3, x5, [sp] // encoding: [0xe5,0x93,0x63,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-
-            rcwsclr   x0, x1, [x4]
-// CHECK:   rcwsclr   x0, x1, [x4] // encoding: [0x81,0x90,0x20,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsclra  x0, x1, [x4]
-// CHECK:   rcwsclra  x0, x1, [x4] // encoding: [0x81,0x90,0xa0,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsclral x0, x1, [x4]
-// CHECK:   rcwsclral x0, x1, [x4] // encoding: [0x81,0x90,0xe0,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsclrl  x0, x1, [x4]
-// CHECK:   rcwsclrl  x0, x1, [x4] // encoding: [0x81,0x90,0x60,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsclr   x3, x5, [sp]
-// CHECK:   rcwsclr   x3, x5, [sp] // encoding: [0xe5,0x93,0x23,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsclra  x3, x5, [sp]
-// CHECK:   rcwsclra  x3, x5, [sp] // encoding: [0xe5,0x93,0xa3,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsclral x3, x5, [sp]
-// CHECK:   rcwsclral x3, x5, [sp] // encoding: [0xe5,0x93,0xe3,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsclrl  x3, x5, [sp]
-// CHECK:   rcwsclrl  x3, x5, [sp] // encoding: [0xe5,0x93,0x63,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-
-            rcwclrp   x1, x0, [x4]
-// CHECK:   rcwclrp   x1, x0, [x4] // encoding: [0x81,0x90,0x20,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwclrpa  x1, x0, [x4]
-// CHECK:   rcwclrpa  x1, x0, [x4] // encoding: [0x81,0x90,0xa0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwclrpal x1, x0, [x4]
-// CHECK:   rcwclrpal x1, x0, [x4] // encoding: [0x81,0x90,0xe0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwclrpl  x1, x0, [x4]
-// CHECK:   rcwclrpl  x1, x0, [x4] // encoding: [0x81,0x90,0x60,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwclrp   x5, x3, [sp]
-// CHECK:   rcwclrp   x5, x3, [sp] // encoding: [0xe5,0x93,0x23,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwclrpa  x5, x3, [sp]
-// CHECK:   rcwclrpa  x5, x3, [sp] // encoding: [0xe5,0x93,0xa3,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwclrpal x5, x3, [sp]
-// CHECK:   rcwclrpal x5, x3, [sp] // encoding: [0xe5,0x93,0xe3,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwclrpl  x5, x3, [sp]
-// CHECK:   rcwclrpl  x5, x3, [sp] // encoding: [0xe5,0x93,0x63,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-
-            rcwsclrp   x1, x0, [x4]
-// CHECK:   rcwsclrp   x1, x0, [x4] // encoding: [0x81,0x90,0x20,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsclrpa  x1, x0, [x4]
-// CHECK:   rcwsclrpa  x1, x0, [x4] // encoding: [0x81,0x90,0xa0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsclrpal x1, x0, [x4]
-// CHECK:   rcwsclrpal x1, x0, [x4] // encoding: [0x81,0x90,0xe0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsclrpl  x1, x0, [x4]
-// CHECK:   rcwsclrpl  x1, x0, [x4] // encoding: [0x81,0x90,0x60,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsclrp   x5, x3, [sp]
-// CHECK:   rcwsclrp   x5, x3, [sp] // encoding: [0xe5,0x93,0x23,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsclrpa  x5, x3, [sp]
-// CHECK:   rcwsclrpa  x5, x3, [sp] // encoding: [0xe5,0x93,0xa3,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsclrpal x5, x3, [sp]
-// CHECK:   rcwsclrpal x5, x3, [sp] // encoding: [0xe5,0x93,0xe3,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsclrpl  x5, x3, [sp]
-// CHECK:   rcwsclrpl  x5, x3, [sp] // encoding: [0xe5,0x93,0x63,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-
-            rcwset   x0, x1, [x4]
-// CHECK:   rcwset   x0, x1, [x4] // encoding: [0x81,0xb0,0x20,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwseta  x0, x1, [x4]
-// CHECK:   rcwseta  x0, x1, [x4] // encoding: [0x81,0xb0,0xa0,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsetal x0, x1, [x4]
-// CHECK:   rcwsetal x0, x1, [x4] // encoding: [0x81,0xb0,0xe0,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsetl  x0, x1, [x4]
-// CHECK:   rcwsetl  x0, x1, [x4] // encoding: [0x81,0xb0,0x60,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwset   x3, x5, [sp]
-// CHECK:   rcwset   x3, x5, [sp] // encoding: [0xe5,0xb3,0x23,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwseta  x3, x5, [sp]
-// CHECK:   rcwseta  x3, x5, [sp] // encoding: [0xe5,0xb3,0xa3,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsetal x3, x5, [sp]
-// CHECK:   rcwsetal x3, x5, [sp] // encoding: [0xe5,0xb3,0xe3,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsetl  x3, x5, [sp]
-// CHECK:   rcwsetl  x3, x5, [sp] // encoding: [0xe5,0xb3,0x63,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-
-            rcwsset   x0, x1, [x4]
-// CHECK:   rcwsset   x0, x1, [x4] // encoding: [0x81,0xb0,0x20,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsseta  x0, x1, [x4]
-// CHECK:   rcwsseta  x0, x1, [x4] // encoding: [0x81,0xb0,0xa0,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwssetal x0, x1, [x4]
-// CHECK:   rcwssetal x0, x1, [x4] // encoding: [0x81,0xb0,0xe0,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwssetl  x0, x1, [x4]
-// CHECK:   rcwssetl  x0, x1, [x4] // encoding: [0x81,0xb0,0x60,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsset   x3, x5, [sp]
-// CHECK:   rcwsset   x3, x5, [sp] // encoding: [0xe5,0xb3,0x23,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsseta  x3, x5, [sp]
-// CHECK:   rcwsseta  x3, x5, [sp] // encoding: [0xe5,0xb3,0xa3,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwssetal x3, x5, [sp]
-// CHECK:   rcwssetal x3, x5, [sp] // encoding: [0xe5,0xb3,0xe3,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwssetl  x3, x5, [sp]
-// CHECK:   rcwssetl  x3, x5, [sp] // encoding: [0xe5,0xb3,0x63,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-
-            rcwsetp   x1, x0, [x4]
-// CHECK:   rcwsetp   x1, x0, [x4] // encoding: [0x81,0xb0,0x20,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsetpa  x1, x0, [x4]
-// CHECK:   rcwsetpa  x1, x0, [x4] // encoding: [0x81,0xb0,0xa0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsetpal x1, x0, [x4]
-// CHECK:   rcwsetpal x1, x0, [x4] // encoding: [0x81,0xb0,0xe0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsetpl  x1, x0, [x4]
-// CHECK:   rcwsetpl  x1, x0, [x4] // encoding: [0x81,0xb0,0x60,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsetp   x5, x3, [sp]
-// CHECK:   rcwsetp   x5, x3, [sp] // encoding: [0xe5,0xb3,0x23,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsetpa  x5, x3, [sp]
-// CHECK:   rcwsetpa  x5, x3, [sp] // encoding: [0xe5,0xb3,0xa3,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsetpal x5, x3, [sp]
-// CHECK:   rcwsetpal x5, x3, [sp] // encoding: [0xe5,0xb3,0xe3,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsetpl  x5, x3, [sp]
-// CHECK:   rcwsetpl  x5, x3, [sp] // encoding: [0xe5,0xb3,0x63,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-
-            rcwssetp   x1, x0, [x4]
-// CHECK:   rcwssetp   x1, x0, [x4] // encoding: [0x81,0xb0,0x20,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwssetpa  x1, x0, [x4]
-// CHECK:   rcwssetpa  x1, x0, [x4] // encoding: [0x81,0xb0,0xa0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwssetpal x1, x0, [x4]
-// CHECK:   rcwssetpal x1, x0, [x4] // encoding: [0x81,0xb0,0xe0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwssetpl  x1, x0, [x4]
-// CHECK:   rcwssetpl  x1, x0, [x4] // encoding: [0x81,0xb0,0x60,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwssetp   x5, x3, [sp]
-// CHECK:   rcwssetp   x5, x3, [sp] // encoding: [0xe5,0xb3,0x23,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwssetpa  x5, x3, [sp]
-// CHECK:   rcwssetpa  x5, x3, [sp] // encoding: [0xe5,0xb3,0xa3,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwssetpal x5, x3, [sp]
-// CHECK:   rcwssetpal x5, x3, [sp] // encoding: [0xe5,0xb3,0xe3,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwssetpl  x5, x3, [sp]
-// CHECK:   rcwssetpl  x5, x3, [sp] // encoding: [0xe5,0xb3,0x63,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-
-            rcwswp   x0, x1, [x4]
-// CHECK:   rcwswp   x0, x1, [x4] // encoding: [0x81,0xa0,0x20,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwswpa  x0, x1, [x4]
-// CHECK:   rcwswpa  x0, x1, [x4] // encoding: [0x81,0xa0,0xa0,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwswpal x0, x1, [x4]
-// CHECK:   rcwswpal x0, x1, [x4] // encoding: [0x81,0xa0,0xe0,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwswpl  x0, x1, [x4]
-// CHECK:   rcwswpl  x0, x1, [x4] // encoding: [0x81,0xa0,0x60,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwswp   x3, x5, [sp]
-// CHECK:   rcwswp   x3, x5, [sp] // encoding: [0xe5,0xa3,0x23,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwswpa  x3, x5, [sp]
-// CHECK:   rcwswpa  x3, x5, [sp] // encoding: [0xe5,0xa3,0xa3,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwswpal x3, x5, [sp]
-// CHECK:   rcwswpal x3, x5, [sp] // encoding: [0xe5,0xa3,0xe3,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwswpl  x3, x5, [sp]
-// CHECK:   rcwswpl  x3, x5, [sp] // encoding: [0xe5,0xa3,0x63,0x38]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-
-            rcwsswp   x0, x1, [x4]
-// CHECK:   rcwsswp   x0, x1, [x4] // encoding: [0x81,0xa0,0x20,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsswpa  x0, x1, [x4]
-// CHECK:   rcwsswpa  x0, x1, [x4] // encoding: [0x81,0xa0,0xa0,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsswpal x0, x1, [x4]
-// CHECK:   rcwsswpal x0, x1, [x4] // encoding: [0x81,0xa0,0xe0,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsswpl  x0, x1, [x4]
-// CHECK:   rcwsswpl  x0, x1, [x4] // encoding: [0x81,0xa0,0x60,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsswp   x3, x5, [sp]
-// CHECK:   rcwsswp   x3, x5, [sp] // encoding: [0xe5,0xa3,0x23,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsswpa  x3, x5, [sp]
-// CHECK:   rcwsswpa  x3, x5, [sp] // encoding: [0xe5,0xa3,0xa3,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsswpal x3, x5, [sp]
-// CHECK:   rcwsswpal x3, x5, [sp] // encoding: [0xe5,0xa3,0xe3,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-            rcwsswpl  x3, x5, [sp]
-// CHECK:   rcwsswpl  x3, x5, [sp] // encoding: [0xe5,0xa3,0x63,0x78]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: the
-
-            rcwswpp   x1, x0, [x4]
-// CHECK:   rcwswpp   x1, x0, [x4] // encoding: [0x81,0xa0,0x20,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwswppa  x1, x0, [x4]
-// CHECK:   rcwswppa  x1, x0, [x4] // encoding: [0x81,0xa0,0xa0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwswppal x1, x0, [x4]
-// CHECK:   rcwswppal x1, x0, [x4] // encoding: [0x81,0xa0,0xe0,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwswppl  x1, x0, [x4]
-// CHECK:   rcwswppl  x1, x0, [x4] // encoding: [0x81,0xa0,0x60,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwswpp   x5, x3, [sp]
-// CHECK:   rcwswpp   x5, x3, [sp] // encoding: [0xe5,0xa3,0x23,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwswppa  x5, x3, [sp]
-// CHECK:   rcwswppa  x5, x3, [sp] // encoding: [0xe5,0xa3,0xa3,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwswppal x5, x3, [sp]
-// CHECK:   rcwswppal x5, x3, [sp] // encoding: [0xe5,0xa3,0xe3,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwswppl  x5, x3, [sp]
-// CHECK:   rcwswppl  x5, x3, [sp] // encoding: [0xe5,0xa3,0x63,0x19]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-
-            rcwsswpp   x1, x0, [x4]
-// CHECK:   rcwsswpp   x1, x0, [x4] // encoding: [0x81,0xa0,0x20,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsswppa  x1, x0, [x4]
-// CHECK:   rcwsswppa  x1, x0, [x4] // encoding: [0x81,0xa0,0xa0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsswppal x1, x0, [x4]
-// CHECK:   rcwsswppal x1, x0, [x4] // encoding: [0x81,0xa0,0xe0,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsswppl  x1, x0, [x4]
-// CHECK:   rcwsswppl  x1, x0, [x4] // encoding: [0x81,0xa0,0x60,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsswpp   x5, x3, [sp]
-// CHECK:   rcwsswpp   x5, x3, [sp] // encoding: [0xe5,0xa3,0x23,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsswppa  x5, x3, [sp]
-// CHECK:   rcwsswppa  x5, x3, [sp] // encoding: [0xe5,0xa3,0xa3,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsswppal x5, x3, [sp]
-// CHECK:   rcwsswppal x5, x3, [sp] // encoding: [0xe5,0xa3,0xe3,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-            rcwsswppl  x5, x3, [sp]
-// CHECK:   rcwsswppl  x5, x3, [sp] // encoding: [0xe5,0xa3,0x63,0x59]
-// ERROR-NO-THE: [[@LINE-2]]:13: error: instruction requires: d128 the
-// ERROR-NO-D128: [[@LINE-3]]:13: error: instruction requires: d128
-
-            rcwswpp   xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwswppa  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwswppal xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwswppl  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwswpp   x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwswppa  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwswppal x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwswppl  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-
-            rcwclrp   xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwclrpa  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwclrpal xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwclrpl  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwclrp   x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwclrpa  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwclrpal x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwclrpl  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-
-            rcwsetp   xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwsetpa  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwsetpal xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwsetpl  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:23: error: invalid operand for instruction
-            rcwsetp   x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwsetpa  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwsetpal x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-            rcwsetpl  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:27: error: invalid operand for instruction
-
-            rcwsswpp   xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwsswppa  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwsswppal xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwsswppl  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwsswpp   x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwsswppa  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwsswppal x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwsswppl  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-
-            rcwsclrp   xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwsclrpa  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwsclrpal xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwsclrpl  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwsclrp   x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwsclrpa  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwsclrpal x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwsclrpl  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-
-            rcwssetp   xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwssetpa  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwssetpal xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwssetpl  xzr, x5, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:24: error: invalid operand for instruction
-            rcwssetp   x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwssetpa  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwssetpal x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
-            rcwssetpl  x5, xzr, [x4]
-// ERROR-NO-ZXR:   [[@LINE-1]]:28: error: invalid operand for instruction
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+the,+d128 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+the,+d128,v8.9a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+the,+d128,v9.4a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+the,+d128 < %s \
+// RUN:        | llvm-objdump -d --mattr=+the,+d128 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+the,+d128 < %s \
+// RUN:   | llvm-objdump -d --mattr=-the,-d128 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+the,+d128 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+the,+d128 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -mattr=+the < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-D128 %s
+
+
+mrs x3, RCWMASK_EL1
+// CHECK-INST: mrs x3, RCWMASK_EL1
+// CHECK-ENCODING: encoding: [0xc3,0xd0,0x38,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d538d0c3      mrs x3, S3_0_C13_C0_6
+
+msr RCWMASK_EL1, x1
+// CHECK-INST: msr RCWMASK_EL1, x1
+// CHECK-ENCODING: encoding: [0xc1,0xd0,0x18,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d518d0c1      msr S3_0_C13_C0_6, x1
+
+mrs x3, RCWSMASK_EL1
+// CHECK-INST: mrs x3, RCWSMASK_EL1
+// CHECK-ENCODING: encoding: [0x63,0xd0,0x38,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d538d063      mrs x3, S3_0_C13_C0_3
+
+msr RCWSMASK_EL1, x1
+// CHECK-INST: msr RCWSMASK_EL1, x1
+// CHECK-ENCODING: encoding: [0x61,0xd0,0x18,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d518d061      msr S3_0_C13_C0_3, x1
+
+rcwcas   x0, x1, [x4]
+// CHECK-INST: rcwcas x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x08,0x20,0x19]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  19200881      <unknown>
+
+rcwcasa  x0, x1, [x4]
+// CHECK-INST: rcwcasa x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x08,0xa0,0x19]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  19a00881      <unknown>
+
+rcwcasal x0, x1, [x4]
+// CHECK-INST: rcwcasal x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x08,0xe0,0x19]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  19e00881      <unknown>
+
+rcwcasl  x0, x1, [x4]
+// CHECK-INST: rcwcasl x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x08,0x60,0x19]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  19600881      <unknown>
+
+rcwcas   x3, x5, [sp]
+// CHECK-INST: rcwcas x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x0b,0x23,0x19]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  19230be5      <unknown>
+
+rcwcasa  x3, x5, [sp]
+// CHECK-INST: rcwcasa x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x0b,0xa3,0x19]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  19a30be5      <unknown>
+
+rcwcasal x3, x5, [sp]
+// CHECK-INST: rcwcasal x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x0b,0xe3,0x19]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  19e30be5      <unknown>
+
+rcwcasl  x3, x5, [sp]
+// CHECK-INST: rcwcasl x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x0b,0x63,0x19]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  19630be5      <unknown>
+
+rcwscas   x0, x1, [x4]
+// CHECK-INST: rcwscas x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x08,0x20,0x59]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  59200881      <unknown>
+
+rcwscasa  x0, x1, [x4]
+// CHECK-INST: rcwscasa x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x08,0xa0,0x59]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  59a00881      <unknown>
+
+rcwscasal x0, x1, [x4]
+// CHECK-INST: rcwscasal x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x08,0xe0,0x59]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  59e00881      <unknown>
+
+rcwscasl  x0, x1, [x4]
+// CHECK-INST: rcwscasl x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x08,0x60,0x59]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  59600881      <unknown>
+
+rcwscas   x3, x5, [sp]
+// CHECK-INST: rcwscas x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x0b,0x23,0x59]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  59230be5      <unknown>
+
+rcwscasa  x3, x5, [sp]
+// CHECK-INST: rcwscasa x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x0b,0xa3,0x59]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  59a30be5      <unknown>
+
+rcwscasal x3, x5, [sp]
+// CHECK-INST: rcwscasal x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x0b,0xe3,0x59]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  59e30be5      <unknown>
+
+rcwscasl  x3, x5, [sp]
+// CHECK-INST: rcwscasl x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x0b,0x63,0x59]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  59630be5      <unknown>
+
+rcwcasp   x0, x1, x6, x7, [x4]
+// CHECK-INST: rcwcasp x0, x1, x6, x7, [x4]
+// CHECK-ENCODING: encoding: [0x86,0x0c,0x20,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19200c86      <unknown>
+
+rcwcaspa  x0, x1, x6, x7, [x4]
+// CHECK-INST: rcwcaspa x0, x1, x6, x7, [x4]
+// CHECK-ENCODING: encoding: [0x86,0x0c,0xa0,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19a00c86      <unknown>
+
+rcwcaspal x0, x1, x6, x7, [x4]
+// CHECK-INST: rcwcaspal x0, x1, x6, x7, [x4]
+// CHECK-ENCODING: encoding: [0x86,0x0c,0xe0,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19e00c86      <unknown>
+
+rcwcaspl  x0, x1, x6, x7, [x4]
+// CHECK-INST: rcwcaspl x0, x1, x6, x7, [x4]
+// CHECK-ENCODING: encoding: [0x86,0x0c,0x60,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19600c86      <unknown>
+
+rcwcasp   x4, x5, x6, x7, [sp]
+// CHECK-INST: rcwcasp x4, x5, x6, x7, [sp]
+// CHECK-ENCODING: encoding: [0xe6,0x0f,0x24,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19240fe6      <unknown>
+
+rcwcaspa  x4, x5, x6, x7, [sp]
+// CHECK-INST: rcwcaspa x4, x5, x6, x7, [sp]
+// CHECK-ENCODING: encoding: [0xe6,0x0f,0xa4,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19a40fe6      <unknown>
+
+rcwcaspal x4, x5, x6, x7, [sp]
+// CHECK-INST: rcwcaspal x4, x5, x6, x7, [sp]
+// CHECK-ENCODING: encoding: [0xe6,0x0f,0xe4,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19e40fe6      <unknown>
+
+rcwcaspl  x4, x5, x6, x7, [sp]
+// CHECK-INST: rcwcaspl x4, x5, x6, x7, [sp]
+// CHECK-ENCODING: encoding: [0xe6,0x0f,0x64,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19640fe6      <unknown>
+
+rcwscasp   x0, x1, x6, x7, [x4]
+// CHECK-INST: rcwscasp x0, x1, x6, x7, [x4]
+// CHECK-ENCODING: encoding: [0x86,0x0c,0x20,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59200c86      <unknown>
+
+rcwscaspa  x0, x1, x6, x7, [x4]
+// CHECK-INST: rcwscaspa x0, x1, x6, x7, [x4]
+// CHECK-ENCODING: encoding: [0x86,0x0c,0xa0,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59a00c86      <unknown>
+
+rcwscaspal x0, x1, x6, x7, [x4]
+// CHECK-INST: rcwscaspal x0, x1, x6, x7, [x4]
+// CHECK-ENCODING: encoding: [0x86,0x0c,0xe0,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59e00c86      <unknown>
+
+rcwscaspl  x0, x1, x6, x7, [x4]
+// CHECK-INST: rcwscaspl x0, x1, x6, x7, [x4]
+// CHECK-ENCODING: encoding: [0x86,0x0c,0x60,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59600c86      <unknown>
+
+rcwscasp   x4, x5, x6, x7, [sp]
+// CHECK-INST: rcwscasp x4, x5, x6, x7, [sp]
+// CHECK-ENCODING: encoding: [0xe6,0x0f,0x24,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59240fe6      <unknown>
+
+rcwscaspa  x4, x5, x6, x7, [sp]
+// CHECK-INST: rcwscaspa x4, x5, x6, x7, [sp]
+// CHECK-ENCODING: encoding: [0xe6,0x0f,0xa4,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59a40fe6      <unknown>
+
+rcwscaspal x4, x5, x6, x7, [sp]
+// CHECK-INST: rcwscaspal x4, x5, x6, x7, [sp]
+// CHECK-ENCODING: encoding: [0xe6,0x0f,0xe4,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59e40fe6      <unknown>
+
+rcwscaspl  x4, x5, x6, x7, [sp]
+// CHECK-INST: rcwscaspl x4, x5, x6, x7, [sp]
+// CHECK-ENCODING: encoding: [0xe6,0x0f,0x64,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59640fe6      <unknown>
+
+rcwclr   x0, x1, [x4]
+// CHECK-INST: rcwclr x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0x20,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38209081      <unknown>
+
+rcwclra  x0, x1, [x4]
+// CHECK-INST: rcwclra x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0xa0,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38a09081      <unknown>
+
+rcwclral x0, x1, [x4]
+// CHECK-INST: rcwclral x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0xe0,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38e09081      <unknown>
+
+rcwclrl  x0, x1, [x4]
+// CHECK-INST: rcwclrl x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0x60,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38609081      <unknown>
+
+rcwclr   x3, x5, [sp]
+// CHECK-INST: rcwclr x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0x23,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  382393e5      <unknown>
+
+rcwclra  x3, x5, [sp]
+// CHECK-INST: rcwclra x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0xa3,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38a393e5      <unknown>
+
+rcwclral x3, x5, [sp]
+// CHECK-INST: rcwclral x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0xe3,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38e393e5      <unknown>
+
+rcwclrl  x3, x5, [sp]
+// CHECK-INST: rcwclrl x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0x63,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  386393e5      <unknown>
+
+rcwsclr   x0, x1, [x4]
+// CHECK-INST: rcwsclr x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0x20,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78209081      <unknown>
+
+rcwsclra  x0, x1, [x4]
+// CHECK-INST: rcwsclra x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0xa0,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78a09081      <unknown>
+
+rcwsclral x0, x1, [x4]
+// CHECK-INST: rcwsclral x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0xe0,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78e09081      <unknown>
+
+rcwsclrl  x0, x1, [x4]
+// CHECK-INST: rcwsclrl x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0x60,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78609081      <unknown>
+
+rcwsclr   x3, x5, [sp]
+// CHECK-INST: rcwsclr x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0x23,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  782393e5      <unknown>
+
+rcwsclra  x3, x5, [sp]
+// CHECK-INST: rcwsclra x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0xa3,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78a393e5      <unknown>
+
+rcwsclral x3, x5, [sp]
+// CHECK-INST: rcwsclral x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0xe3,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78e393e5      <unknown>
+
+rcwsclrl  x3, x5, [sp]
+// CHECK-INST: rcwsclrl x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0x63,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  786393e5      <unknown>
+
+rcwclrp   x1, x0, [x4]
+// CHECK-INST: rcwclrp x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0x20,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19209081      <unknown>
+
+rcwclrpa  x1, x0, [x4]
+// CHECK-INST: rcwclrpa x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0xa0,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19a09081      <unknown>
+
+rcwclrpal x1, x0, [x4]
+// CHECK-INST: rcwclrpal x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0xe0,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19e09081      <unknown>
+
+rcwclrpl  x1, x0, [x4]
+// CHECK-INST: rcwclrpl x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0x60,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19609081      <unknown>
+
+rcwclrp   x5, x3, [sp]
+// CHECK-INST: rcwclrp x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0x23,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  192393e5      <unknown>
+
+rcwclrpa  x5, x3, [sp]
+// CHECK-INST: rcwclrpa x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0xa3,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19a393e5      <unknown>
+
+rcwclrpal x5, x3, [sp]
+// CHECK-INST: rcwclrpal x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0xe3,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19e393e5      <unknown>
+
+rcwclrpl  x5, x3, [sp]
+// CHECK-INST: rcwclrpl x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0x63,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  196393e5      <unknown>
+
+rcwsclrp   x1, x0, [x4]
+// CHECK-INST: rcwsclrp x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0x20,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59209081      <unknown>
+
+rcwsclrpa  x1, x0, [x4]
+// CHECK-INST: rcwsclrpa x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0xa0,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59a09081      <unknown>
+
+rcwsclrpal x1, x0, [x4]
+// CHECK-INST: rcwsclrpal x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0xe0,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59e09081      <unknown>
+
+rcwsclrpl  x1, x0, [x4]
+// CHECK-INST: rcwsclrpl x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0x90,0x60,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59609081      <unknown>
+
+rcwsclrp   x5, x3, [sp]
+// CHECK-INST: rcwsclrp x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0x23,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  592393e5      <unknown>
+
+rcwsclrpa  x5, x3, [sp]
+// CHECK-INST: rcwsclrpa x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0xa3,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59a393e5      <unknown>
+
+rcwsclrpal x5, x3, [sp]
+// CHECK-INST: rcwsclrpal x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0xe3,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59e393e5      <unknown>
+
+rcwsclrpl  x5, x3, [sp]
+// CHECK-INST: rcwsclrpl x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0x93,0x63,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  596393e5      <unknown>
+
+rcwset   x0, x1, [x4]
+// CHECK-INST: rcwset x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0x20,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  3820b081      <unknown>
+
+rcwseta  x0, x1, [x4]
+// CHECK-INST: rcwseta x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0xa0,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38a0b081      <unknown>
+
+rcwsetal x0, x1, [x4]
+// CHECK-INST: rcwsetal x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0xe0,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38e0b081      <unknown>
+
+rcwsetl  x0, x1, [x4]
+// CHECK-INST: rcwsetl x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0x60,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  3860b081      <unknown>
+
+rcwset   x3, x5, [sp]
+// CHECK-INST: rcwset x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0x23,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  3823b3e5      <unknown>
+
+rcwseta  x3, x5, [sp]
+// CHECK-INST: rcwseta x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0xa3,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38a3b3e5      <unknown>
+
+rcwsetal x3, x5, [sp]
+// CHECK-INST: rcwsetal x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0xe3,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38e3b3e5      <unknown>
+
+rcwsetl  x3, x5, [sp]
+// CHECK-INST: rcwsetl x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0x63,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  3863b3e5      <unknown>
+
+rcwsset   x0, x1, [x4]
+// CHECK-INST: rcwsset x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0x20,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  7820b081      <unknown>
+
+rcwsseta  x0, x1, [x4]
+// CHECK-INST: rcwsseta x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0xa0,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78a0b081      <unknown>
+
+rcwssetal x0, x1, [x4]
+// CHECK-INST: rcwssetal x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0xe0,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78e0b081      <unknown>
+
+rcwssetl  x0, x1, [x4]
+// CHECK-INST: rcwssetl x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0x60,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  7860b081      <unknown>
+
+rcwsset   x3, x5, [sp]
+// CHECK-INST: rcwsset x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0x23,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  7823b3e5      <unknown>
+
+rcwsseta  x3, x5, [sp]
+// CHECK-INST: rcwsseta x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0xa3,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78a3b3e5      <unknown>
+
+rcwssetal x3, x5, [sp]
+// CHECK-INST: rcwssetal x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0xe3,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78e3b3e5      <unknown>
+
+rcwssetl  x3, x5, [sp]
+// CHECK-INST: rcwssetl x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0x63,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  7863b3e5      <unknown>
+
+rcwsetp   x1, x0, [x4]
+// CHECK-INST: rcwsetp x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0x20,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  1920b081      <unknown>
+
+rcwsetpa  x1, x0, [x4]
+// CHECK-INST: rcwsetpa x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0xa0,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19a0b081      <unknown>
+
+rcwsetpal x1, x0, [x4]
+// CHECK-INST: rcwsetpal x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0xe0,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19e0b081      <unknown>
+
+rcwsetpl  x1, x0, [x4]
+// CHECK-INST: rcwsetpl x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0x60,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  1960b081      <unknown>
+
+rcwsetp   x5, x3, [sp]
+// CHECK-INST: rcwsetp x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0x23,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  1923b3e5      <unknown>
+
+rcwsetpa  x5, x3, [sp]
+// CHECK-INST: rcwsetpa x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0xa3,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19a3b3e5      <unknown>
+
+rcwsetpal x5, x3, [sp]
+// CHECK-INST: rcwsetpal x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0xe3,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  19e3b3e5      <unknown>
+
+rcwsetpl  x5, x3, [sp]
+// CHECK-INST: rcwsetpl x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0x63,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  1963b3e5      <unknown>
+
+rcwssetp   x1, x0, [x4]
+// CHECK-INST: rcwssetp x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0x20,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  5920b081      <unknown>
+
+rcwssetpa  x1, x0, [x4]
+// CHECK-INST: rcwssetpa x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0xa0,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  59a0b081      <unknown>
+
+rcwssetpal x1, x0, [x4]
+// CHECK-INST: rcwssetpal x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0xe0,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  59e0b081      <unknown>
+
+rcwssetpl  x1, x0, [x4]
+// CHECK-INST: rcwssetpl x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xb0,0x60,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  5960b081      <unknown>
+
+rcwssetp   x5, x3, [sp]
+// CHECK-INST: rcwssetp x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0x23,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  5923b3e5      <unknown>
+
+rcwssetpa  x5, x3, [sp]
+// CHECK-INST: rcwssetpa x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0xa3,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  59a3b3e5      <unknown>
+
+rcwssetpal x5, x3, [sp]
+// CHECK-INST: rcwssetpal x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0xe3,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  59e3b3e5      <unknown>
+
+rcwssetpl  x5, x3, [sp]
+// CHECK-INST: rcwssetpl x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xb3,0x63,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// ERROR-NO-D128: error: instruction requires: d128
+// CHECK-UNKNOWN:  5963b3e5      <unknown>
+
+rcwswp   x0, x1, [x4]
+// CHECK-INST: rcwswp x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0x20,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  3820a081      <unknown>
+
+rcwswpa  x0, x1, [x4]
+// CHECK-INST: rcwswpa x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0xa0,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38a0a081      <unknown>
+
+rcwswpal x0, x1, [x4]
+// CHECK-INST: rcwswpal x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0xe0,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38e0a081      <unknown>
+
+rcwswpl  x0, x1, [x4]
+// CHECK-INST: rcwswpl x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0x60,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  3860a081      <unknown>
+
+rcwswp   x3, x5, [sp]
+// CHECK-INST: rcwswp x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0x23,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  3823a3e5      <unknown>
+
+rcwswpa  x3, x5, [sp]
+// CHECK-INST: rcwswpa x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0xa3,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38a3a3e5      <unknown>
+
+rcwswpal x3, x5, [sp]
+// CHECK-INST: rcwswpal x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0xe3,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  38e3a3e5      <unknown>
+
+rcwswpl  x3, x5, [sp]
+// CHECK-INST: rcwswpl x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0x63,0x38]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  3863a3e5      <unknown>
+
+rcwsswp   x0, x1, [x4]
+// CHECK-INST: rcwsswp x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0x20,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  7820a081      <unknown>
+
+rcwsswpa  x0, x1, [x4]
+// CHECK-INST: rcwsswpa x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0xa0,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78a0a081      <unknown>
+
+rcwsswpal x0, x1, [x4]
+// CHECK-INST: rcwsswpal x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0xe0,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78e0a081      <unknown>
+
+rcwsswpl  x0, x1, [x4]
+// CHECK-INST: rcwsswpl x0, x1, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0x60,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  7860a081      <unknown>
+
+rcwsswp   x3, x5, [sp]
+// CHECK-INST: rcwsswp x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0x23,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  7823a3e5      <unknown>
+
+rcwsswpa  x3, x5, [sp]
+// CHECK-INST: rcwsswpa x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0xa3,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78a3a3e5      <unknown>
+
+rcwsswpal x3, x5, [sp]
+// CHECK-INST: rcwsswpal x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0xe3,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  78e3a3e5      <unknown>
+
+rcwsswpl  x3, x5, [sp]
+// CHECK-INST: rcwsswpl x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0x63,0x78]
+// CHECK-ERROR: error: instruction requires: the
+// CHECK-UNKNOWN:  7863a3e5      <unknown>
+
+rcwswpp   x1, x0, [x4]
+// CHECK-INST: rcwswpp x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0x20,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  1920a081      <unknown>
+
+rcwswppa  x1, x0, [x4]
+// CHECK-INST: rcwswppa x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0xa0,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  19a0a081      <unknown>
+
+rcwswppal x1, x0, [x4]
+// CHECK-INST: rcwswppal x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0xe0,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  19e0a081      <unknown>
+
+rcwswppl  x1, x0, [x4]
+// CHECK-INST: rcwswppl x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0x60,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  1960a081      <unknown>
+
+rcwswpp   x5, x3, [sp]
+// CHECK-INST: rcwswpp x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0x23,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  1923a3e5      <unknown>
+
+rcwswppa  x5, x3, [sp]
+// CHECK-INST: rcwswppa x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0xa3,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  19a3a3e5      <unknown>
+
+rcwswppal x5, x3, [sp]
+// CHECK-INST: rcwswppal x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0xe3,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  19e3a3e5      <unknown>
+
+rcwswppl  x5, x3, [sp]
+// CHECK-INST: rcwswppl x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0x63,0x19]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  1963a3e5      <unknown>
+
+rcwsswpp   x1, x0, [x4]
+// CHECK-INST: rcwsswpp x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0x20,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  5920a081      <unknown>
+
+rcwsswppa  x1, x0, [x4]
+// CHECK-INST: rcwsswppa x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0xa0,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59a0a081      <unknown>
+
+rcwsswppal x1, x0, [x4]
+// CHECK-INST: rcwsswppal x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0xe0,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59e0a081      <unknown>
+
+rcwsswppl  x1, x0, [x4]
+// CHECK-INST: rcwsswppl x1, x0, [x4]
+// CHECK-ENCODING: encoding: [0x81,0xa0,0x60,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  5960a081      <unknown>
+
+rcwsswpp   x5, x3, [sp]
+// CHECK-INST: rcwsswpp x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0x23,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  5923a3e5      <unknown>
+
+rcwsswppa  x5, x3, [sp]
+// CHECK-INST: rcwsswppa x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0xa3,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59a3a3e5      <unknown>
+
+rcwsswppal x5, x3, [sp]
+// CHECK-INST: rcwsswppal x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0xe3,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  59e3a3e5      <unknown>
+
+rcwsswppl  x5, x3, [sp]
+// CHECK-INST: rcwsswppl x5, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe5,0xa3,0x63,0x59]
+// CHECK-ERROR: error: instruction requires: d128 the
+// CHECK-UNKNOWN:  5963a3e5      <unknown>
diff --git a/llvm/test/MC/AArch64/armv9-mrrs-diagnostics.s b/llvm/test/MC/AArch64/armv9-mrrs-diagnostics.s
new file mode 100644
index 0000000..4eb8861
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9-mrrs-diagnostics.s
@@ -0,0 +1,30 @@
+// +the required for RCWSMASK_EL1, RCWMASK_EL1
+// +el2vmsa required for TTBR0_EL2 (VSCTLR_EL2), VTTBR_EL2
+// +vh required for TTBR1_EL2
+
+// RUN: not llvm-mc -triple=aarch64 -mattr=+d128,+the,+el2vmsa,+vh -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+
+mrrs x0, x2, TTBR0_EL1
+// CHECK-ERROR: error: expected second odd register of a consecutive same-size even/odd register pair
+
+mrrs x0, TTBR0_EL1
+// CHECK-ERROR: error: expected second odd register of a consecutive same-size even/odd register pair
+
+mrrs x1, x2, TTBR0_EL1
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+mrrs x31, x0, TTBR0_EL1
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+mrrs xzr, x30, TTBR0_EL1
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+mrrs xzr, TTBR0_EL1
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+mrrs S3_0_c2_c0_1
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+mrrs S3_0_c2_c0_1, x0, x1
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
diff --git a/llvm/test/MC/AArch64/armv9-mrrs.s b/llvm/test/MC/AArch64/armv9-mrrs.s
index 8701278..1fc7274 100644
--- a/llvm/test/MC/AArch64/armv9-mrrs.s
+++ b/llvm/test/MC/AArch64/armv9-mrrs.s
@@ -1,100 +1,282 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+d128,+the,+el2vmsa,+vh < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -mattr=+the,+el2vmsa,+vh -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+d128,+the,+el2vmsa,+vh < %s \
+// RUN:        | llvm-objdump -d --mattr=+d128,+the,+el2vmsa,+vh - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+d128,+the,+el2vmsa,+vh < %s \
+// RUN:   | llvm-objdump -d --mattr=-d128,+the,+el2vmsa,+vh - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+d128,+the,+el2vmsa,+vh < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+d128,+the,+el2vmsa,+vh -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 // +the required for RCWSMASK_EL1, RCWMASK_EL1
 // +el2vmsa required for TTBR0_EL2 (VSCTLR_EL2), VTTBR_EL2
 // +vh required for TTBR1_EL2
 
-// RUN: not llvm-mc -triple aarch64 -mattr=+d128,+the,+el2vmsa,+vh -show-encoding %s -o - 2> %t | FileCheck %s
-// RUN: FileCheck %s --input-file=%t --check-prefix=ERRORS
-
-// RUN: not llvm-mc -triple aarch64 -mattr=+the,+el2vmsa,+vh -show-encoding %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR-NO-D128
-
-          mrrs  x0, x1, TTBR0_EL1
-// CHECK: mrrs  x0, x1, TTBR0_EL1           // encoding: [0x00,0x20,0x78,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x0, x1, TTBR1_EL1
-// CHECK: mrrs  x0, x1, TTBR1_EL1           // encoding: [0x20,0x20,0x78,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x0, x1, PAR_EL1
-// CHECK: mrrs  x0, x1, PAR_EL1             // encoding: [0x00,0x74,0x78,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x0, x1, RCWSMASK_EL1
-// CHECK: mrrs  x0, x1, RCWSMASK_EL1        // encoding: [0x60,0xd0,0x78,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x0, x1, RCWMASK_EL1
-// CHECK: mrrs  x0, x1, RCWMASK_EL1         // encoding: [0xc0,0xd0,0x78,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x0, x1, TTBR0_EL2
-// CHECK: mrrs  x0, x1, TTBR0_EL2           // encoding: [0x00,0x20,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x0, x1, TTBR1_EL2
-// CHECK: mrrs  x0, x1, TTBR1_EL2           // encoding: [0x20,0x20,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x0, x1, VTTBR_EL2
-// CHECK: mrrs  x0, x1, VTTBR_EL2           // encoding: [0x00,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          mrrs   x0,  x1, VTTBR_EL2
-// CHECK: mrrs   x0,  x1, VTTBR_EL2           // encoding: [0x00,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs   x2,  x3, VTTBR_EL2
-// CHECK: mrrs   x2,  x3, VTTBR_EL2           // encoding: [0x02,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs   x4,  x5, VTTBR_EL2
-// CHECK: mrrs   x4,  x5, VTTBR_EL2           // encoding: [0x04,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs   x6,  x7, VTTBR_EL2
-// CHECK: mrrs   x6,  x7, VTTBR_EL2           // encoding: [0x06,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs   x8,  x9, VTTBR_EL2
-// CHECK: mrrs   x8,  x9, VTTBR_EL2           // encoding: [0x08,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x10, x11, VTTBR_EL2
-// CHECK: mrrs  x10, x11, VTTBR_EL2           // encoding: [0x0a,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x12, x13, VTTBR_EL2
-// CHECK: mrrs  x12, x13, VTTBR_EL2           // encoding: [0x0c,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x14, x15, VTTBR_EL2
-// CHECK: mrrs  x14, x15, VTTBR_EL2           // encoding: [0x0e,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x16, x17, VTTBR_EL2
-// CHECK: mrrs  x16, x17, VTTBR_EL2           // encoding: [0x10,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x18, x19, VTTBR_EL2
-// CHECK: mrrs  x18, x19, VTTBR_EL2           // encoding: [0x12,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x20, x21, VTTBR_EL2
-// CHECK: mrrs  x20, x21, VTTBR_EL2           // encoding: [0x14,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x22, x23, VTTBR_EL2
-// CHECK: mrrs  x22, x23, VTTBR_EL2           // encoding: [0x16,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x24, x25, VTTBR_EL2
-// CHECK: mrrs  x24, x25, VTTBR_EL2           // encoding: [0x18,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          mrrs  x26, x27, VTTBR_EL2
-// CHECK: mrrs  x26, x27, VTTBR_EL2           // encoding: [0x1a,0x21,0x7c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          mrrs x0, x2, TTBR0_EL1
-// ERRORS: error: expected second odd register of a consecutive same-size even/odd register pair
-
-          mrrs x0, TTBR0_EL1
-// ERRORS: error: expected second odd register of a consecutive same-size even/odd register pair
-
-          mrrs x1, x2, TTBR0_EL1
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          mrrs x31, x0, TTBR0_EL1
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          mrrs xzr, x30, TTBR0_EL1
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          mrrs xzr, TTBR0_EL1
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          mrrs S3_0_c2_c0_1
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          mrrs S3_0_c2_c0_1, x0, x1
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
+mrrs  x0, x1, TTBR0_EL1
+// CHECK-INST: mrrs x0, x1, TTBR0_EL1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x78,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5782000      <unknown>
+
+mrrs  x0, x1, TTBR1_EL1
+// CHECK-INST: mrrs x0, x1, TTBR1_EL1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x78,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5782020      <unknown>
+
+mrrs  x0, x1, PAR_EL1
+// CHECK-INST: mrrs x0, x1, PAR_EL1
+// CHECK-ENCODING: encoding: [0x00,0x74,0x78,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5787400      <unknown>
+
+mrrs  x0, x1, RCWSMASK_EL1
+// CHECK-INST: mrrs x0, x1, RCWSMASK_EL1
+// CHECK-ENCODING: encoding: [0x60,0xd0,0x78,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d578d060      <unknown>
+
+mrrs  x0, x1, RCWMASK_EL1
+// CHECK-INST: mrrs x0, x1, RCWMASK_EL1
+// CHECK-ENCODING: encoding: [0xc0,0xd0,0x78,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d578d0c0      <unknown>
+
+mrrs  x0, x1, TTBR0_EL2
+// CHECK-INST: mrrs x0, x1, TTBR0_EL2
+// CHECK-ENCODING: encoding: [0x00,0x20,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2000      <unknown>
+
+mrrs  x0, x1, TTBR1_EL2
+// CHECK-INST: mrrs x0, x1, TTBR1_EL2
+// CHECK-ENCODING: encoding: [0x20,0x20,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2020      <unknown>
+
+mrrs  x0, x1, VTTBR_EL2
+// CHECK-INST: mrrs x0, x1, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x00,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2100      <unknown>
+
+mrrs   x0,  x1, VTTBR_EL2
+// CHECK-INST: mrrs x0, x1, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x00,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2100      <unknown>
+
+mrrs   x2,  x3, VTTBR_EL2
+// CHECK-INST: mrrs x2, x3, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x02,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2102      <unknown>
+
+mrrs   x4,  x5, VTTBR_EL2
+// CHECK-INST: mrrs x4, x5, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x04,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2104      <unknown>
+
+mrrs   x6,  x7, VTTBR_EL2
+// CHECK-INST: mrrs x6, x7, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x06,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2106      <unknown>
+
+mrrs   x8,  x9, VTTBR_EL2
+// CHECK-INST: mrrs x8, x9, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x08,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2108      <unknown>
+
+mrrs  x10, x11, VTTBR_EL2
+// CHECK-INST: mrrs x10, x11, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x0a,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c210a      <unknown>
+
+mrrs  x12, x13, VTTBR_EL2
+// CHECK-INST: mrrs x12, x13, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x0c,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c210c      <unknown>
+
+mrrs  x14, x15, VTTBR_EL2
+// CHECK-INST: mrrs x14, x15, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x0e,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c210e      <unknown>
+
+mrrs  x16, x17, VTTBR_EL2
+// CHECK-INST: mrrs x16, x17, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x10,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2110      <unknown>
+
+mrrs  x18, x19, VTTBR_EL2
+// CHECK-INST: mrrs x18, x19, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x12,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2112      <unknown>
+
+mrrs  x20, x21, VTTBR_EL2
+// CHECK-INST: mrrs x20, x21, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x14,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2114      <unknown>
+
+mrrs  x22, x23, VTTBR_EL2
+// CHECK-INST: mrrs x22, x23, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x16,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2116      <unknown>
+
+mrrs  x24, x25, VTTBR_EL2
+// CHECK-INST: mrrs x24, x25, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x18,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c2118      <unknown>
+
+mrrs  x26, x27, VTTBR_EL2
+// CHECK-INST: mrrs x26, x27, VTTBR_EL2
+// CHECK-ENCODING: encoding: [0x1a,0x21,0x7c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d57c211a      <unknown>
+
+msrr  TTBR0_EL1, x0, x1
+// CHECK-INST: msrr TTBR0_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5582000      <unknown>
+
+msrr  TTBR1_EL1, x0, x1
+// CHECK-INST: msrr TTBR1_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5582020      <unknown>
+
+msrr  PAR_EL1, x0, x1
+// CHECK-INST: msrr PAR_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x74,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5587400      <unknown>
+
+msrr  RCWSMASK_EL1, x0, x1
+// CHECK-INST: msrr RCWSMASK_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0x60,0xd0,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d558d060      <unknown>
+
+msrr  RCWMASK_EL1, x0, x1
+// CHECK-INST: msrr RCWMASK_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0xc0,0xd0,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d558d0c0      <unknown>
+
+msrr  TTBR0_EL2, x0, x1
+// CHECK-INST: msrr TTBR0_EL2, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2000      <unknown>
+
+msrr  TTBR1_EL2, x0, x1
+// CHECK-INST: msrr TTBR1_EL2, x0, x1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2020      <unknown>
+
+msrr  VTTBR_EL2, x0, x1
+// CHECK-INST: msrr VTTBR_EL2, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2100      <unknown>
+
+msrr   VTTBR_EL2, x0, x1
+// CHECK-INST: msrr VTTBR_EL2, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2100      <unknown>
+
+msrr   VTTBR_EL2, x2, x3
+// CHECK-INST: msrr VTTBR_EL2, x2, x3
+// CHECK-ENCODING: encoding: [0x02,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2102      <unknown>
+
+msrr   VTTBR_EL2, x4, x5
+// CHECK-INST: msrr VTTBR_EL2, x4, x5
+// CHECK-ENCODING: encoding: [0x04,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2104      <unknown>
+
+msrr   VTTBR_EL2, x6, x7
+// CHECK-INST: msrr VTTBR_EL2, x6, x7
+// CHECK-ENCODING: encoding: [0x06,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2106      <unknown>
+
+msrr   VTTBR_EL2, x8, x9
+// CHECK-INST: msrr VTTBR_EL2, x8, x9
+// CHECK-ENCODING: encoding: [0x08,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2108      <unknown>
+
+msrr   VTTBR_EL2, x10, x11
+// CHECK-INST: msrr VTTBR_EL2, x10, x11
+// CHECK-ENCODING: encoding: [0x0a,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c210a      <unknown>
+
+msrr   VTTBR_EL2, x12, x13
+// CHECK-INST: msrr VTTBR_EL2, x12, x13
+// CHECK-ENCODING: encoding: [0x0c,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c210c      <unknown>
+
+msrr   VTTBR_EL2, x14, x15
+// CHECK-INST: msrr VTTBR_EL2, x14, x15
+// CHECK-ENCODING: encoding: [0x0e,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c210e      <unknown>
+
+msrr   VTTBR_EL2, x16, x17
+// CHECK-INST: msrr VTTBR_EL2, x16, x17
+// CHECK-ENCODING: encoding: [0x10,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2110      <unknown>
+
+msrr   VTTBR_EL2, x18, x19
+// CHECK-INST: msrr VTTBR_EL2, x18, x19
+// CHECK-ENCODING: encoding: [0x12,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2112      <unknown>
+
+msrr   VTTBR_EL2, x20, x21
+// CHECK-INST: msrr VTTBR_EL2, x20, x21
+// CHECK-ENCODING: encoding: [0x14,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2114      <unknown>
+
+msrr   VTTBR_EL2, x22, x23
+// CHECK-INST: msrr VTTBR_EL2, x22, x23
+// CHECK-ENCODING: encoding: [0x16,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2116      <unknown>
+
+msrr   VTTBR_EL2, x24, x25
+// CHECK-INST: msrr VTTBR_EL2, x24, x25
+// CHECK-ENCODING: encoding: [0x18,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2118      <unknown>
+
+msrr   VTTBR_EL2, x26, x27
+// CHECK-INST: msrr VTTBR_EL2, x26, x27
+// CHECK-ENCODING: encoding: [0x1a,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c211a      <unknown>
diff --git a/llvm/test/MC/AArch64/armv9-msrr-diagnostics.s b/llvm/test/MC/AArch64/armv9-msrr-diagnostics.s
new file mode 100644
index 0000000..d49a3ee
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9-msrr-diagnostics.s
@@ -0,0 +1,30 @@
+// +the required for RCWSMASK_EL1, RCWMASK_EL1
+// +el2vmsa required for TTBR0_EL2 (VSCTLR_EL2), VTTBR_EL2
+// +vh required for TTBR1_EL2
+
+// RUN: not llvm-mc -triple=aarch64 -mattr=+d128,+the,+el2vmsa,+vh -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+
+msrr TTBR0_EL1, x0, x2
+// CHECK-ERROR: error: expected second odd register of a consecutive same-size even/odd register pair
+
+msrr TTBR0_EL1, x0
+// CHECK-ERROR: error: expected comma
+
+msrr TTBR0_EL1, x1, x2
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+msrr TTBR0_EL1, x31, x0
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+msrr TTBR0_EL1, xzr, x30
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+msrr TTBR0_EL1, xzr
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+
+msrr S3_0_c2_c0_1
+// CHECK-ERROR: error: too few operands for instruction
+
+msrr x0, x1, S3_0_c2_c0_1
+// CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
diff --git a/llvm/test/MC/AArch64/armv9-msrr.s b/llvm/test/MC/AArch64/armv9-msrr.s
index 2be17a7..439e0547 100644
--- a/llvm/test/MC/AArch64/armv9-msrr.s
+++ b/llvm/test/MC/AArch64/armv9-msrr.s
@@ -1,100 +1,150 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+d128,+the,+el2vmsa,+vh < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -mattr=+the,+el2vmsa,+vh -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+d128,+the,+el2vmsa,+vh < %s \
+// RUN:        | llvm-objdump -d --mattr=+d128,+the,+el2vmsa,+vh - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+d128,+the,+el2vmsa,+vh < %s \
+// RUN:   | llvm-objdump -d --mattr=-d128,+the,+el2vmsa,+vh - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+d128,+the,+el2vmsa,+vh < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+d128,+the,+el2vmsa,+vh -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 // +the required for RCWSMASK_EL1, RCWMASK_EL1
 // +el2vmsa required for TTBR0_EL2 (VSCTLR_EL2), VTTBR_EL2
 // +vh required for TTBR1_EL2
 
-// RUN: not llvm-mc -triple aarch64 -mattr=+d128,+the,+el2vmsa,+vh -show-encoding %s -o - 2> %t | FileCheck %s
-// RUN: FileCheck %s --input-file=%t --check-prefix=ERRORS
-
-// RUN: not llvm-mc -triple aarch64 -mattr=+the,+el2vmsa,+vh -show-encoding %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR-NO-D128
-
-          msrr  TTBR0_EL1, x0, x1
-// CHECK: msrr  TTBR0_EL1, x0, x1           // encoding: [0x00,0x20,0x58,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr  TTBR1_EL1, x0, x1
-// CHECK: msrr  TTBR1_EL1, x0, x1           // encoding: [0x20,0x20,0x58,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr  PAR_EL1, x0, x1
-// CHECK: msrr  PAR_EL1, x0, x1             // encoding: [0x00,0x74,0x58,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr  RCWSMASK_EL1, x0, x1
-// CHECK: msrr  RCWSMASK_EL1, x0, x1        // encoding: [0x60,0xd0,0x58,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr  RCWMASK_EL1, x0, x1
-// CHECK: msrr  RCWMASK_EL1, x0, x1         // encoding: [0xc0,0xd0,0x58,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr  TTBR0_EL2, x0, x1
-// CHECK: msrr  TTBR0_EL2, x0, x1           // encoding: [0x00,0x20,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr  TTBR1_EL2, x0, x1
-// CHECK: msrr  TTBR1_EL2, x0, x1           // encoding: [0x20,0x20,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr  VTTBR_EL2, x0, x1
-// CHECK: msrr  VTTBR_EL2, x0, x1           // encoding: [0x00,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          msrr   VTTBR_EL2, x0, x1
-// CHECK: msrr   VTTBR_EL2, x0, x1           // encoding: [0x00,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x2, x3
-// CHECK: msrr   VTTBR_EL2, x2, x3           // encoding: [0x02,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x4, x5
-// CHECK: msrr   VTTBR_EL2, x4, x5           // encoding: [0x04,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x6, x7
-// CHECK: msrr   VTTBR_EL2, x6, x7           // encoding: [0x06,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x8, x9
-// CHECK: msrr   VTTBR_EL2, x8, x9           // encoding: [0x08,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x10, x11
-// CHECK: msrr   VTTBR_EL2, x10, x11           // encoding: [0x0a,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x12, x13
-// CHECK: msrr   VTTBR_EL2, x12, x13           // encoding: [0x0c,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x14, x15
-// CHECK: msrr   VTTBR_EL2, x14, x15           // encoding: [0x0e,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x16, x17
-// CHECK: msrr   VTTBR_EL2, x16, x17           // encoding: [0x10,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x18, x19
-// CHECK: msrr   VTTBR_EL2, x18, x19           // encoding: [0x12,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x20, x21
-// CHECK: msrr   VTTBR_EL2, x20, x21           // encoding: [0x14,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x22, x23
-// CHECK: msrr   VTTBR_EL2, x22, x23           // encoding: [0x16,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x24, x25
-// CHECK: msrr   VTTBR_EL2, x24, x25           // encoding: [0x18,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          msrr   VTTBR_EL2, x26, x27
-// CHECK: msrr   VTTBR_EL2, x26, x27           // encoding: [0x1a,0x21,0x5c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          msrr TTBR0_EL1, x0, x2
-// ERRORS: error: expected second odd register of a consecutive same-size even/odd register pair
-
-          msrr TTBR0_EL1, x0
-// ERRORS: error: expected comma
-
-          msrr TTBR0_EL1, x1, x2
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          msrr TTBR0_EL1, x31, x0
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          msrr TTBR0_EL1, xzr, x30
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          msrr TTBR0_EL1, xzr
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          msrr S3_0_c2_c0_1
-// ERRORS: error: too few operands for instruction
-
-          msrr x0, x1, S3_0_c2_c0_1
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
+msrr  TTBR0_EL1, x0, x1
+// CHECK-INST: msrr TTBR0_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5582000      <unknown>
+
+msrr  TTBR1_EL1, x0, x1
+// CHECK-INST: msrr TTBR1_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5582020      <unknown>
+
+msrr  PAR_EL1, x0, x1
+// CHECK-INST: msrr PAR_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x74,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5587400      <unknown>
+
+msrr  RCWSMASK_EL1, x0, x1
+// CHECK-INST: msrr RCWSMASK_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0x60,0xd0,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d558d060      <unknown>
+
+msrr  RCWMASK_EL1, x0, x1
+// CHECK-INST: msrr RCWMASK_EL1, x0, x1
+// CHECK-ENCODING: encoding: [0xc0,0xd0,0x58,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d558d0c0      <unknown>
+
+msrr  TTBR0_EL2, x0, x1
+// CHECK-INST: msrr TTBR0_EL2, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2000      <unknown>
+
+msrr  TTBR1_EL2, x0, x1
+// CHECK-INST: msrr TTBR1_EL2, x0, x1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2020      <unknown>
+
+msrr  VTTBR_EL2, x0, x1
+// CHECK-INST: msrr VTTBR_EL2, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2100      <unknown>
+
+msrr   VTTBR_EL2, x0, x1
+// CHECK-INST: msrr VTTBR_EL2, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2100      <unknown>
+
+msrr   VTTBR_EL2, x2, x3
+// CHECK-INST: msrr VTTBR_EL2, x2, x3
+// CHECK-ENCODING: encoding: [0x02,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2102      <unknown>
+
+msrr   VTTBR_EL2, x4, x5
+// CHECK-INST: msrr VTTBR_EL2, x4, x5
+// CHECK-ENCODING: encoding: [0x04,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2104      <unknown>
+
+msrr   VTTBR_EL2, x6, x7
+// CHECK-INST: msrr VTTBR_EL2, x6, x7
+// CHECK-ENCODING: encoding: [0x06,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2106      <unknown>
+
+msrr   VTTBR_EL2, x8, x9
+// CHECK-INST: msrr VTTBR_EL2, x8, x9
+// CHECK-ENCODING: encoding: [0x08,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2108      <unknown>
+
+msrr   VTTBR_EL2, x10, x11
+// CHECK-INST: msrr VTTBR_EL2, x10, x11
+// CHECK-ENCODING: encoding: [0x0a,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c210a      <unknown>
+
+msrr   VTTBR_EL2, x12, x13
+// CHECK-INST: msrr VTTBR_EL2, x12, x13
+// CHECK-ENCODING: encoding: [0x0c,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c210c      <unknown>
+
+msrr   VTTBR_EL2, x14, x15
+// CHECK-INST: msrr VTTBR_EL2, x14, x15
+// CHECK-ENCODING: encoding: [0x0e,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c210e      <unknown>
+
+msrr   VTTBR_EL2, x16, x17
+// CHECK-INST: msrr VTTBR_EL2, x16, x17
+// CHECK-ENCODING: encoding: [0x10,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2110      <unknown>
+
+msrr   VTTBR_EL2, x18, x19
+// CHECK-INST: msrr VTTBR_EL2, x18, x19
+// CHECK-ENCODING: encoding: [0x12,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2112      <unknown>
+
+msrr   VTTBR_EL2, x20, x21
+// CHECK-INST: msrr VTTBR_EL2, x20, x21
+// CHECK-ENCODING: encoding: [0x14,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2114      <unknown>
+
+msrr   VTTBR_EL2, x22, x23
+// CHECK-INST: msrr VTTBR_EL2, x22, x23
+// CHECK-ENCODING: encoding: [0x16,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2116      <unknown>
+
+msrr   VTTBR_EL2, x24, x25
+// CHECK-INST: msrr VTTBR_EL2, x24, x25
+// CHECK-ENCODING: encoding: [0x18,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c2118      <unknown>
+
+msrr   VTTBR_EL2, x26, x27
+// CHECK-INST: msrr VTTBR_EL2, x26, x27
+// CHECK-ENCODING: encoding: [0x1a,0x21,0x5c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: d128
+// CHECK-UNKNOWN:  d55c211a      <unknown>
diff --git a/llvm/test/MC/AArch64/armv9-sysp-diagnostics.s b/llvm/test/MC/AArch64/armv9-sysp-diagnostics.s
new file mode 100644
index 0000000..8b466c1
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9-sysp-diagnostics.s
@@ -0,0 +1,35 @@
+// +tlb-rmi required for RIPA*/RVA*
+// +xs required for *NXS
+
+// RUN: not llvm-mc -triple aarch64 -mattr=+d128,+tlb-rmi,+xs -show-encoding %s -o - 2>&1 | FileCheck %s --check-prefix=ERRORS
+
+// sysp #<op1>, <Cn>, <Cm>, #<op2>{, <Xt1>, <Xt2>}
+// registers with 128-bit formats (op0, op1, Cn, Cm, op2)
+// For sysp, op0 is 0
+
+sysp #0, c2, c0, #0, x0, x2
+// ERRORS: error: expected second odd register of a consecutive same-size even/odd register pair
+sysp #0, c2, c0, #0, x0
+// ERRORS: error: expected comma
+sysp #0, c2, c0, #0, x1, x2
+// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
+sysp #0, c2, c0, #0, x31, x0
+// ERRORS: error: xzr must be followed by xzr
+sysp #0, c2, c0, #0, xzr, x30
+// ERRORS: error: xzr must be followed by xzr
+sysp #0, c2, c0, #0, xzr
+// ERRORS: error: expected comma
+sysp #0, c2, c0, #0, xzr,
+// ERRORS: error: expected register operand
+
+
+tlbip RVAE3IS
+// ERRORS: error: expected comma
+tlbip RVAE3IS,
+// ERRORS: error: expected register identifier
+tlbip VAE3,
+// ERRORS: error: expected register identifier
+tlbip IPAS2E1, x4, x8
+// ERRORS: error: specified tlbip op requires a pair of registers
+tlbip RVAE3, x11, x11
+// ERRORS: error: specified tlbip op requires a pair of registers
diff --git a/llvm/test/MC/AArch64/armv9-sysp.s b/llvm/test/MC/AArch64/armv9-sysp.s
deleted file mode 100644
index 908e880..0000000
--- a/llvm/test/MC/AArch64/armv9-sysp.s
+++ /dev/null
@@ -1,538 +0,0 @@
-// +tbl-rmi required for RIPA*/RVA*
-// +xs required for *NXS
-
-// RUN: not llvm-mc -triple aarch64 -mattr=+d128,+tlb-rmi,+xs -show-encoding %s -o - 2> %t | FileCheck %s
-// RUN: FileCheck %s --input-file=%t --check-prefix=ERRORS
-
-// RUN: not llvm-mc -triple aarch64 -mattr=+tlb-rmi,+xs -show-encoding %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR-NO-D128
-
-// sysp #<op1>, <Cn>, <Cm>, #<op2>{, <Xt1>, <Xt2>}
-// registers with 128-bit formats (op0, op1, Cn, Cm, op2)
-// For sysp, op0 is 0
-
-          sysp #0, c2, c0, #0, x0, x1          // TTBR0_EL1     3  0  2  0  0
-// CHECK: sysp #0, c2, c0, #0, x0, x1          // encoding: [0x00,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #1, x0, x1          // TTBR1_EL1     3  0  2  0  1
-// CHECK: sysp #0, c2, c0, #1, x0, x1          // encoding: [0x20,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c7, c4, #0, x0, x1          // PAR_EL1       3  0  7  4  0
-// CHECK: sysp #0, c7, c4, #0, x0, x1          // encoding: [0x00,0x74,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c13, c0, #3, x0, x1         // RCWSMASK_EL1  3  0 13  0  3
-// CHECK: sysp #0, c13, c0, #3, x0, x1         // encoding: [0x60,0xd0,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c13, c0, #6, x0, x1         // RCWMASK_EL1   3  0 13  0  6
-// CHECK: sysp #0, c13, c0, #6, x0, x1         // encoding: [0xc0,0xd0,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #4, c2, c0, #0, x0, x1          // TTBR0_EL2     3  4  2  0  0
-// CHECK: sysp #4, c2, c0, #0, x0, x1          // encoding: [0x00,0x20,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #4, c2, c0, #1, x0, x1          // TTBR1_EL2     3  4  2  0  1
-// CHECK: sysp #4, c2, c0, #1, x0, x1          // encoding: [0x20,0x20,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #4, c2, c1, #0, x0, x1          // VTTBR_EL2     3  4  2  1  0
-// CHECK: sysp #4, c2, c1, #0, x0, x1          // encoding: [0x00,0x21,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-
-          sysp #0, c2, c0, #0, x0, x1
-// CHECK: sysp #0, c2, c0, #0, x0, x1          // encoding: [0x00,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #1, x0, x1
-// CHECK: sysp #0, c2, c0, #1, x0, x1          // encoding: [0x20,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c7, c4, #0, x0, x1
-// CHECK: sysp #0, c7, c4, #0, x0, x1          // encoding: [0x00,0x74,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c13, c0, #3, x0, x1
-// CHECK: sysp #0, c13, c0, #3, x0, x1         // encoding: [0x60,0xd0,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c13, c0, #6, x0, x1
-// CHECK: sysp #0, c13, c0, #6, x0, x1         // encoding: [0xc0,0xd0,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #4, c2, c0, #0, x0, x1
-// CHECK: sysp #4, c2, c0, #0, x0, x1          // encoding: [0x00,0x20,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #4, c2, c0, #1, x0, x1
-// CHECK: sysp #4, c2, c0, #1, x0, x1          // encoding: [0x20,0x20,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #4, c2, c1, #0, x0, x1
-// CHECK: sysp #4, c2, c1, #0, x0, x1          // encoding: [0x00,0x21,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          sysp #0, c2, c0, #0, x0, x1
-// CHECK: sysp #0, c2, c0, #0, x0, x1          // encoding: [0x00,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x2, x3
-// CHECK: sysp #0, c2, c0, #0, x2, x3          // encoding: [0x02,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x4, x5
-// CHECK: sysp #0, c2, c0, #0, x4, x5          // encoding: [0x04,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x6, x7
-// CHECK: sysp #0, c2, c0, #0, x6, x7          // encoding: [0x06,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x8, x9
-// CHECK: sysp #0, c2, c0, #0, x8, x9          // encoding: [0x08,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x10, x11
-// CHECK: sysp #0, c2, c0, #0, x10, x11        // encoding: [0x0a,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x12, x13
-// CHECK: sysp #0, c2, c0, #0, x12, x13        // encoding: [0x0c,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x14, x15
-// CHECK: sysp #0, c2, c0, #0, x14, x15        // encoding: [0x0e,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x16, x17
-// CHECK: sysp #0, c2, c0, #0, x16, x17        // encoding: [0x10,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x18, x19
-// CHECK: sysp #0, c2, c0, #0, x18, x19        // encoding: [0x12,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x20, x21
-// CHECK: sysp #0, c2, c0, #0, x20, x21        // encoding: [0x14,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x22, x23
-// CHECK: sysp #0, c2, c0, #0, x22, x23        // encoding: [0x16,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x24, x25
-// CHECK: sysp #0, c2, c0, #0, x24, x25        // encoding: [0x18,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x26, x27
-// CHECK: sysp #0, c2, c0, #0, x26, x27        // encoding: [0x1a,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x28, x29
-// CHECK: sysp #0, c2, c0, #0, x28, x29        // encoding: [0x1c,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x30, x31
-// CHECK: sysp #0, c2, c0, #0, x30, xzr        // encoding: [0x1e,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          sysp #0, c2, c0, #0, x31, x31
-// CHECK: sysp #0, c2, c0, #0                  // encoding: [0x1f,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, xzr, xzr
-// CHECK: sysp #0, c2, c0, #0                  // encoding: [0x1f,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, x31, xzr
-// CHECK: sysp #0, c2, c0, #0                  // encoding: [0x1f,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0, xzr, x31
-// CHECK: sysp #0, c2, c0, #0                  // encoding: [0x1f,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          sysp #0, c2, c0, #0
-// CHECK: sysp #0, c2, c0, #0                  // encoding: [0x1f,0x20,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-
-          sysp #0, c2, c0, #0, x0, x2
-// ERRORS: error: expected second odd register of a consecutive same-size even/odd register pair
-
-          sysp #0, c2, c0, #0, x0
-// ERRORS: error: expected comma
-
-          sysp #0, c2, c0, #0, x1, x2
-// ERRORS: error: expected first even register of a consecutive same-size even/odd register pair
-
-          sysp #0, c2, c0, #0, x31, x0
-// ERRORS: error: xzr must be followed by xzr
-
-          sysp #0, c2, c0, #0, xzr, x30
-// ERRORS: error: xzr must be followed by xzr
-
-          sysp #0, c2, c0, #0, xzr
-// ERRORS: error: expected comma
-
-          sysp #0, c2, c0, #0, xzr,
-// ERRORS: error: expected register operand
-
-
-          tlbip IPAS2E1, x4, x5
-// CHECK: tlbip ipas2e1, x4, x5                 // encoding: [0x24,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2E1NXS, x4, x5
-// CHECK: tlbip ipas2e1nxs, x4, x5              // encoding: [0x24,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2E1IS, x4, x5
-// CHECK: tlbip ipas2e1is, x4, x5               // encoding: [0x24,0x80,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2E1ISNXS, x4, x5
-// CHECK: tlbip ipas2e1isnxs, x4, x5            // encoding: [0x24,0x90,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2E1OS, x4, x5
-// CHECK: tlbip ipas2e1os, x4, x5               // encoding: [0x04,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2E1OSNXS, x4, x5
-// CHECK: tlbip ipas2e1osnxs, x4, x5            // encoding: [0x04,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2LE1, x4, x5
-// CHECK: tlbip ipas2le1, x4, x5                // encoding: [0xa4,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2LE1NXS, x4, x5
-// CHECK: tlbip ipas2le1nxs, x4, x5             // encoding: [0xa4,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2LE1IS, x4, x5
-// CHECK: tlbip ipas2le1is, x4, x5              // encoding: [0xa4,0x80,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2LE1ISNXS, x4, x5
-// CHECK: tlbip ipas2le1isnxs, x4, x5           // encoding: [0xa4,0x90,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2LE1OS, x4, x5
-// CHECK: tlbip ipas2le1os, x4, x5              // encoding: [0x84,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip IPAS2LE1OSNXS, x4, x5
-// CHECK: tlbip ipas2le1osnxs, x4, x5           // encoding: [0x84,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-
-          tlbip VAE1, x8, x9
-// CHECK: tlbip vae1, x8, x9                    // encoding: [0x28,0x87,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE1NXS, x8, x9
-// CHECK: tlbip vae1nxs, x8, x9                 // encoding: [0x28,0x97,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE1IS, x8, x9
-// CHECK: tlbip vae1is, x8, x9                  // encoding: [0x28,0x83,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE1ISNXS, x8, x9
-// CHECK: tlbip vae1isnxs, x8, x9               // encoding: [0x28,0x93,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE1OS, x8, x9
-// CHECK: tlbip vae1os, x8, x9                  // encoding: [0x28,0x81,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE1OSNXS, x8, x9
-// CHECK: tlbip vae1osnxs, x8, x9               // encoding: [0x28,0x91,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE1, x8, x9
-// CHECK: tlbip vale1, x8, x9                   // encoding: [0xa8,0x87,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE1NXS, x8, x9
-// CHECK: tlbip vale1nxs, x8, x9                // encoding: [0xa8,0x97,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE1IS, x8, x9
-// CHECK: tlbip vale1is, x8, x9                 // encoding: [0xa8,0x83,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE1ISNXS, x8, x9
-// CHECK: tlbip vale1isnxs, x8, x9              // encoding: [0xa8,0x93,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE1OS, x8, x9
-// CHECK: tlbip vale1os, x8, x9                 // encoding: [0xa8,0x81,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE1OSNXS, x8, x9
-// CHECK: tlbip vale1osnxs, x8, x9              // encoding: [0xa8,0x91,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAAE1, x8, x9
-// CHECK: tlbip vaae1, x8, x9                   // encoding: [0x68,0x87,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAAE1NXS, x8, x9
-// CHECK: tlbip vaae1nxs, x8, x9                // encoding: [0x68,0x97,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAAE1IS, x8, x9
-// CHECK: tlbip vaae1is, x8, x9                 // encoding: [0x68,0x83,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAAE1ISNXS, x8, x9
-// CHECK: tlbip vaae1isnxs, x8, x9              // encoding: [0x68,0x93,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAAE1OS, x8, x9
-// CHECK: tlbip vaae1os, x8, x9                 // encoding: [0x68,0x81,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAAE1OSNXS, x8, x9
-// CHECK: tlbip vaae1osnxs, x8, x9              // encoding: [0x68,0x91,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAALE1, x8, x9
-// CHECK: tlbip vaale1, x8, x9                  // encoding: [0xe8,0x87,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAALE1NXS, x8, x9
-// CHECK: tlbip vaale1nxs, x8, x9               // encoding: [0xe8,0x97,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAALE1IS, x8, x9
-// CHECK: tlbip vaale1is, x8, x9                // encoding: [0xe8,0x83,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAALE1ISNXS, x8, x9
-// CHECK: tlbip vaale1isnxs, x8, x9             // encoding: [0xe8,0x93,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAALE1OS, x8, x9
-// CHECK: tlbip vaale1os, x8, x9                // encoding: [0xe8,0x81,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAALE1OSNXS, x8, x9
-// CHECK: tlbip vaale1osnxs, x8, x9             // encoding: [0xe8,0x91,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          tlbip VAE2, x14, x15
-// CHECK: tlbip vae2, x14, x15                    // encoding: [0x2e,0x87,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE2NXS, x14, x15
-// CHECK: tlbip vae2nxs, x14, x15                 // encoding: [0x2e,0x97,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE2IS, x14, x15
-// CHECK: tlbip vae2is, x14, x15                  // encoding: [0x2e,0x83,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE2ISNXS, x14, x15
-// CHECK: tlbip vae2isnxs, x14, x15               // encoding: [0x2e,0x93,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE2OS, x14, x15
-// CHECK: tlbip vae2os, x14, x15                  // encoding: [0x2e,0x81,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE2OSNXS, x14, x15
-// CHECK: tlbip vae2osnxs, x14, x15               // encoding: [0x2e,0x91,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE2, x14, x15
-// CHECK: tlbip vale2, x14, x15                   // encoding: [0xae,0x87,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE2NXS, x14, x15
-// CHECK: tlbip vale2nxs, x14, x15                // encoding: [0xae,0x97,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE2IS, x14, x15
-// CHECK: tlbip vale2is, x14, x15                 // encoding: [0xae,0x83,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE2ISNXS, x14, x15
-// CHECK: tlbip vale2isnxs, x14, x15              // encoding: [0xae,0x93,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE2OS, x14, x15
-// CHECK: tlbip vale2os, x14, x15                 // encoding: [0xae,0x81,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE2OSNXS, x14, x15
-// CHECK: tlbip vale2osnxs, x14, x15              // encoding: [0xae,0x91,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          tlbip VAE3, x24, x25
-// CHECK: tlbip vae3, x24, x25                    // encoding: [0x38,0x87,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE3NXS, x24, x25
-// CHECK: tlbip vae3nxs, x24, x25                 // encoding: [0x38,0x97,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE3IS, x24, x25
-// CHECK: tlbip vae3is, x24, x25                  // encoding: [0x38,0x83,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE3ISNXS, x24, x25
-// CHECK: tlbip vae3isnxs, x24, x25               // encoding: [0x38,0x93,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE3OS, x24, x25
-// CHECK: tlbip vae3os, x24, x25                  // encoding: [0x38,0x81,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VAE3OSNXS, x24, x25
-// CHECK: tlbip vae3osnxs, x24, x25               // encoding: [0x38,0x91,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE3, x24, x25
-// CHECK: tlbip vale3, x24, x25                   // encoding: [0xb8,0x87,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE3NXS, x24, x25
-// CHECK: tlbip vale3nxs, x24, x25                // encoding: [0xb8,0x97,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE3IS, x24, x25
-// CHECK: tlbip vale3is, x24, x25                 // encoding: [0xb8,0x83,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE3ISNXS, x24, x25
-// CHECK: tlbip vale3isnxs, x24, x25              // encoding: [0xb8,0x93,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE3OS, x24, x25
-// CHECK: tlbip vale3os, x24, x25                 // encoding: [0xb8,0x81,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip VALE3OSNXS, x24, x25
-// CHECK: tlbip vale3osnxs, x24, x25              // encoding: [0xb8,0x91,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-
-          tlbip RVAE1, x18, x19
-// CHECK: tlbip rvae1, x18, x19                   // encoding: [0x32,0x86,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE1NXS, x18, x19
-// CHECK: tlbip rvae1nxs, x18, x19                // encoding: [0x32,0x96,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE1IS, x18, x19
-// CHECK: tlbip rvae1is, x18, x19                 // encoding: [0x32,0x82,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE1ISNXS, x18, x19
-// CHECK: tlbip rvae1isnxs, x18, x19              // encoding: [0x32,0x92,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE1OS, x18, x19
-// CHECK: tlbip rvae1os, x18, x19                 // encoding: [0x32,0x85,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE1OSNXS, x18, x19
-// CHECK: tlbip rvae1osnxs, x18, x19              // encoding: [0x32,0x95,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAAE1, x18, x19
-// CHECK: tlbip rvaae1, x18, x19                  // encoding: [0x72,0x86,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAAE1NXS, x18, x19
-// CHECK: tlbip rvaae1nxs, x18, x19               // encoding: [0x72,0x96,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAAE1IS, x18, x19
-// CHECK: tlbip rvaae1is, x18, x19                // encoding: [0x72,0x82,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAAE1ISNXS, x18, x19
-// CHECK: tlbip rvaae1isnxs, x18, x19             // encoding: [0x72,0x92,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAAE1OS, x18, x19
-// CHECK: tlbip rvaae1os, x18, x19                // encoding: [0x72,0x85,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAAE1OSNXS, x18, x19
-// CHECK: tlbip rvaae1osnxs, x18, x19             // encoding: [0x72,0x95,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE1, x18, x19
-// CHECK: tlbip rvale1, x18, x19                  // encoding: [0xb2,0x86,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE1NXS, x18, x19
-// CHECK: tlbip rvale1nxs, x18, x19               // encoding: [0xb2,0x96,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE1IS, x18, x19
-// CHECK: tlbip rvale1is, x18, x19                // encoding: [0xb2,0x82,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE1ISNXS, x18, x19
-// CHECK: tlbip rvale1isnxs, x18, x19             // encoding: [0xb2,0x92,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE1OS, x18, x19
-// CHECK: tlbip rvale1os, x18, x19                // encoding: [0xb2,0x85,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE1OSNXS, x18, x19
-// CHECK: tlbip rvale1osnxs, x18, x19             // encoding: [0xb2,0x95,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAALE1, x18, x19
-// CHECK: tlbip rvaale1, x18, x19                 // encoding: [0xf2,0x86,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAALE1NXS, x18, x19
-// CHECK: tlbip rvaale1nxs, x18, x19              // encoding: [0xf2,0x96,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAALE1IS, x18, x19
-// CHECK: tlbip rvaale1is, x18, x19               // encoding: [0xf2,0x82,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAALE1ISNXS, x18, x19
-// CHECK: tlbip rvaale1isnxs, x18, x19            // encoding: [0xf2,0x92,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAALE1OS, x18, x19
-// CHECK: tlbip rvaale1os, x18, x19               // encoding: [0xf2,0x85,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAALE1OSNXS, x18, x19
-// CHECK: tlbip rvaale1osnxs, x18, x19            // encoding: [0xf2,0x95,0x48,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          tlbip RVAE2, x28, x29
-// CHECK: tlbip rvae2, x28, x29                   // encoding: [0x3c,0x86,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE2NXS, x28, x29
-// CHECK: tlbip rvae2nxs, x28, x29                // encoding: [0x3c,0x96,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE2IS, x28, x29
-// CHECK: tlbip rvae2is, x28, x29                 // encoding: [0x3c,0x82,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE2ISNXS, x28, x29
-// CHECK: tlbip rvae2isnxs, x28, x29              // encoding: [0x3c,0x92,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE2OS, x28, x29
-// CHECK: tlbip rvae2os, x28, x29                 // encoding: [0x3c,0x85,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE2OSNXS, x28, x29
-// CHECK: tlbip rvae2osnxs, x28, x29              // encoding: [0x3c,0x95,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE2, x28, x29
-// CHECK: tlbip rvale2, x28, x29                  // encoding: [0xbc,0x86,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE2NXS, x28, x29
-// CHECK: tlbip rvale2nxs, x28, x29               // encoding: [0xbc,0x96,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE2IS, x28, x29
-// CHECK: tlbip rvale2is, x28, x29                // encoding: [0xbc,0x82,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE2ISNXS, x28, x29
-// CHECK: tlbip rvale2isnxs, x28, x29             // encoding: [0xbc,0x92,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE2OS, x28, x29
-// CHECK: tlbip rvale2os, x28, x29                // encoding: [0xbc,0x85,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE2OSNXS, x28, x29
-// CHECK: tlbip rvale2osnxs, x28, x29             // encoding: [0xbc,0x95,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          tlbip RVAE3, x10, x11
-// CHECK: tlbip rvae3, x10, x11                   // encoding: [0x2a,0x86,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE3NXS, x10, x11
-// CHECK: tlbip rvae3nxs, x10, x11                // encoding: [0x2a,0x96,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE3IS, x10, x11
-// CHECK: tlbip rvae3is, x10, x11                 // encoding: [0x2a,0x82,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE3ISNXS, x10, x11
-// CHECK: tlbip rvae3isnxs, x10, x11              // encoding: [0x2a,0x92,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE3OS, x10, x11
-// CHECK: tlbip rvae3os, x10, x11                 // encoding: [0x2a,0x85,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE3OSNXS, x10, x11
-// CHECK: tlbip rvae3osnxs, x10, x11              // encoding: [0x2a,0x95,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE3, x10, x11
-// CHECK: tlbip rvale3, x10, x11                  // encoding: [0xaa,0x86,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE3NXS, x10, x11
-// CHECK: tlbip rvale3nxs, x10, x11               // encoding: [0xaa,0x96,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE3IS, x10, x11
-// CHECK: tlbip rvale3is, x10, x11                // encoding: [0xaa,0x82,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE3ISNXS, x10, x11
-// CHECK: tlbip rvale3isnxs, x10, x11             // encoding: [0xaa,0x92,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE3OS, x10, x11
-// CHECK: tlbip rvale3os, x10, x11                // encoding: [0xaa,0x85,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVALE3OSNXS, x10, x11
-// CHECK: tlbip rvale3osnxs, x10, x11             // encoding: [0xaa,0x95,0x4e,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-
-          tlbip RIPAS2E1, x20, x21
-// CHECK: tlbip ripas2e1, x20, x21                // encoding: [0x54,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2E1NXS, x20, x21
-// CHECK: tlbip ripas2e1nxs, x20, x21             // encoding: [0x54,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2E1IS, x20, x21
-// CHECK: tlbip ripas2e1is, x20, x21              // encoding: [0x54,0x80,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2E1ISNXS, x20, x21
-// CHECK: tlbip ripas2e1isnxs, x20, x21           // encoding: [0x54,0x90,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2E1OS, x20, x21
-// CHECK: tlbip ripas2e1os, x20, x21              // encoding: [0x74,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2E1OSNXS, x20, x21
-// CHECK: tlbip ripas2e1osnxs, x20, x21           // encoding: [0x74,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2LE1, x20, x21
-// CHECK: tlbip ripas2le1, x20, x21               // encoding: [0xd4,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2LE1NXS, x20, x21
-// CHECK: tlbip ripas2le1nxs, x20, x21            // encoding: [0xd4,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2LE1IS, x20, x21
-// CHECK: tlbip ripas2le1is, x20, x21             // encoding: [0xd4,0x80,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2LE1ISNXS, x20, x21
-// CHECK: tlbip ripas2le1isnxs, x20, x21          // encoding: [0xd4,0x90,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2LE1OS, x20, x21
-// CHECK: tlbip ripas2le1os, x20, x21             // encoding: [0xf4,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2LE1OSNXS, x20, x21
-// CHECK: tlbip ripas2le1osnxs, x20, x21          // encoding: [0xf4,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-
-          tlbip RIPAS2LE1OS, xzr, xzr
-// CHECK: tlbip ripas2le1os, xzr, xzr             // encoding: [0xff,0x84,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RIPAS2LE1OSNXS, xzr, xzr
-// CHECK: tlbip ripas2le1osnxs, xzr, xzr          // encoding: [0xff,0x94,0x4c,0xd5]
-// ERROR-NO-D128: [[@LINE-2]]:11: error: instruction requires: d128
-          tlbip RVAE3IS
-// ERRORS: error: expected comma
-          tlbip RVAE3IS,
-// ERRORS: error: expected register identifier
-          tlbip VAE3,
-// ERRORS: error: expected register identifier
-          tlbip IPAS2E1, x4, x8
-// ERRORS: error: specified tlbip op requires a pair of registers
-          tlbip RVAE3, x11, x11
-// ERRORS: error: specified tlbip op requires a pair of registers
diff --git a/llvm/test/MC/AArch64/armv9.2a-mec.s b/llvm/test/MC/AArch64/armv9.2a-mec.s
index 42e4bf7..1998b43 100644
--- a/llvm/test/MC/AArch64/armv9.2a-mec.s
+++ b/llvm/test/MC/AArch64/armv9.2a-mec.s
@@ -1,55 +1,129 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+mec < %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-MEC %s
-
-          mrs x0, MECIDR_EL2
-// CHECK: mrs   x0, MECIDR_EL2       // encoding: [0xe0,0xa8,0x3c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:19: error: expected readable system register
-          mrs x0, MECID_P0_EL2
-// CHECK: mrs   x0, MECID_P0_EL2      // encoding: [0x00,0xa8,0x3c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:19: error: expected readable system register
-          mrs x0, MECID_A0_EL2
-// CHECK: mrs   x0, MECID_A0_EL2      // encoding: [0x20,0xa8,0x3c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:19: error: expected readable system register
-          mrs x0, MECID_P1_EL2
-// CHECK: mrs   x0, MECID_P1_EL2      // encoding: [0x40,0xa8,0x3c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:19: error: expected readable system register
-          mrs x0, MECID_A1_EL2
-// CHECK: mrs   x0, MECID_A1_EL2      // encoding: [0x60,0xa8,0x3c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:19: error: expected readable system register
-          mrs x0, VMECID_P_EL2
-// CHECK: mrs   x0, VMECID_P_EL2     // encoding: [0x00,0xa9,0x3c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:19: error: expected readable system register
-          mrs x0, VMECID_A_EL2
-// CHECK: mrs   x0, VMECID_A_EL2     // encoding: [0x20,0xa9,0x3c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:19: error: expected readable system register
-          mrs x0, MECID_RL_A_EL3
-// CHECK: mrs   x0, MECID_RL_A_EL3   // encoding: [0x20,0xaa,0x3e,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:19: error: expected readable system register
-          msr MECID_P0_EL2,    x0
-// CHECK: msr   MECID_P0_EL2, x0      // encoding: [0x00,0xa8,0x1c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:15: error: expected writable system register or pstate
-          msr MECID_A0_EL2,    x0
-// CHECK: msr   MECID_A0_EL2, x0      // encoding: [0x20,0xa8,0x1c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:15: error: expected writable system register or pstate
-          msr MECID_P1_EL2,    x0
-// CHECK: msr   MECID_P1_EL2, x0      // encoding: [0x40,0xa8,0x1c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:15: error: expected writable system register or pstate
-          msr MECID_A1_EL2,    x0
-// CHECK: msr   MECID_A1_EL2, x0      // encoding: [0x60,0xa8,0x1c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:15: error: expected writable system register or pstate
-          msr VMECID_P_EL2,   x0
-// CHECK: msr   VMECID_P_EL2, x0     // encoding: [0x00,0xa9,0x1c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:15: error: expected writable system register or pstate
-          msr VMECID_A_EL2,   x0
-// CHECK: msr   VMECID_A_EL2, x0     // encoding: [0x20,0xa9,0x1c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:15: error: expected writable system register or pstate
-          msr MECID_RL_A_EL3, x0
-// CHECK: msr   MECID_RL_A_EL3, x0   // encoding: [0x20,0xaa,0x1e,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:15: error: expected writable system register or pstate
-
-          dc cigdpae, x0
-// CHECK: dc cigdpae, x0             // encoding: [0xe0,0x7e,0x0c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:14: error: DC CIGDPAE requires: mec
-          dc cipae, x0
-// CHECK: dc cipae, x0               // encoding: [0x00,0x7e,0x0c,0xd5]
-// CHECK-NO-MEC: [[@LINE-2]]:14: error: DC CIPAE requires: mec
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mec < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mec < %s \
+// RUN:        | llvm-objdump -d --mattr=+mec --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+mec < %s \
+// RUN:        | llvm-objdump -d --mattr=-mec --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+mec < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+mec -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple aarch64 -disassemble < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-MEC %s
+
+
+mrs x0, MECIDR_EL2
+// CHECK-INST: mrs x0, MECIDR_EL2
+// CHECK-ENCODING: encoding: [0xe0,0xa8,0x3c,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53ca8e0 mrs x0, S3_4_C10_C8_7
+
+mrs x0, MECID_P0_EL2
+// CHECK-INST: mrs x0, MECID_P0_EL2
+// CHECK-ENCODING: encoding: [0x00,0xa8,0x3c,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53ca800 mrs x0, S3_4_C10_C8_0
+
+mrs x0, MECID_A0_EL2
+// CHECK-INST: mrs x0, MECID_A0_EL2
+// CHECK-ENCODING: encoding: [0x20,0xa8,0x3c,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53ca820 mrs x0, S3_4_C10_C8_1
+
+mrs x0, MECID_P1_EL2
+// CHECK-INST: mrs x0, MECID_P1_EL2
+// CHECK-ENCODING: encoding: [0x40,0xa8,0x3c,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53ca840 mrs x0, S3_4_C10_C8_2
+
+mrs x0, MECID_A1_EL2
+// CHECK-INST: mrs x0, MECID_A1_EL2
+// CHECK-ENCODING: encoding: [0x60,0xa8,0x3c,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53ca860 mrs x0, S3_4_C10_C8_3
+
+mrs x0, VMECID_P_EL2
+// CHECK-INST: mrs x0, VMECID_P_EL2
+// CHECK-ENCODING: encoding: [0x00,0xa9,0x3c,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53ca900 mrs x0, S3_4_C10_C9_0
+
+mrs x0, VMECID_A_EL2
+// CHECK-INST: mrs x0, VMECID_A_EL2
+// CHECK-ENCODING: encoding: [0x20,0xa9,0x3c,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53ca920 mrs x0, S3_4_C10_C9_1
+
+mrs x0, MECID_RL_A_EL3
+// CHECK-INST: mrs x0, MECID_RL_A_EL3
+// CHECK-ENCODING: encoding: [0x20,0xaa,0x3e,0xd5]
+// CHECK-ERROR: error: expected readable system register
+// CHECK-UNKNOWN:  d53eaa20 mrs x0, S3_6_C10_C10_1
+
+msr MECID_P0_EL2,    x0
+// CHECK-INST: msr MECID_P0_EL2, x0
+// CHECK-ENCODING: encoding: [0x00,0xa8,0x1c,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51ca800 msr S3_4_C10_C8_0, x0
+
+msr MECID_A0_EL2,    x0
+// CHECK-INST: msr MECID_A0_EL2, x0
+// CHECK-ENCODING: encoding: [0x20,0xa8,0x1c,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51ca820 msr S3_4_C10_C8_1, x0
+
+msr MECID_P1_EL2,    x0
+// CHECK-INST: msr MECID_P1_EL2, x0
+// CHECK-ENCODING: encoding: [0x40,0xa8,0x1c,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51ca840 msr S3_4_C10_C8_2, x0
+
+msr MECID_A1_EL2,    x0
+// CHECK-INST: msr MECID_A1_EL2, x0
+// CHECK-ENCODING: encoding: [0x60,0xa8,0x1c,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51ca860 msr S3_4_C10_C8_3, x0
+
+msr VMECID_P_EL2,   x0
+// CHECK-INST: msr VMECID_P_EL2, x0
+// CHECK-ENCODING: encoding: [0x00,0xa9,0x1c,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51ca900 msr S3_4_C10_C9_0, x0
+
+msr VMECID_A_EL2,   x0
+// CHECK-INST: msr VMECID_A_EL2, x0
+// CHECK-ENCODING: encoding: [0x20,0xa9,0x1c,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51ca920 msr S3_4_C10_C9_1, x0
+
+msr MECID_RL_A_EL3, x0
+// CHECK-INST: msr MECID_RL_A_EL3, x0
+// CHECK-ENCODING: encoding: [0x20,0xaa,0x1e,0xd5]
+// CHECK-ERROR: error: expected writable system register or pstate
+// CHECK-UNKNOWN:  d51eaa20 msr S3_6_C10_C10_1, x0
+
+dc cigdpae, x0
+// CHECK-INST: dc cigdpae, x0
+// CHECK-ENCODING: encoding: [0xe0,0x7e,0x0c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:4: error: DC CIGDPAE requires: mec
+// CHECK-UNKNOWN:  d50c7ee0 sys #4, c7, c14, #7, x0
+// CHECK-NO-MEC: sys #4, c7, c14, #7, x0
+
+dc cipae, x0
+// CHECK-INST: dc cipae, x0
+// CHECK-ENCODING: encoding: [0x00,0x7e,0x0c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:4: error: DC CIPAE requires: mec
+// CHECK-UNKNOWN:  d50c7e00 sys #4, c7, c14, #0, x0
+// CHECK-NO-MEC: sys #4, c7, c14, #0, x0
+
+sys #4, c7, c14, #7, x0
+// CHECK-INST: dc cigdpae, x0
+// CHECK-ENCODING: encoding: [0xe0,0x7e,0x0c,0xd5]
+// CHECK-UNKNOWN:  d50c7ee0 sys #4, c7, c14, #7, x0
+
+sys #4, c7, c14, #0, x0
+// CHECK-INST: dc cipae, x0
+// CHECK-ENCODING: encoding: [0x00,0x7e,0x0c,0xd5]
+// CHECK-UNKNOWN:  d50c7e00 sys #4, c7, c14, #0, x0
diff --git a/llvm/test/MC/AArch64/armv9.4-lse128.s b/llvm/test/MC/AArch64/armv9.4-lse128.s
deleted file mode 100644
index a639278..0000000
--- a/llvm/test/MC/AArch64/armv9.4-lse128.s
+++ /dev/null
@@ -1,98 +0,0 @@
-// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr +lse128 %s 2>%t | FileCheck %s
-// RUN: FileCheck %s --input-file=%t --check-prefix=ERROR-INVALID-OP
-// RUN: not llvm-mc -triple aarch64 -show-encoding %s 2>&1 | FileCheck --check-prefix=ERROR-NO-LSE128 %s
-
-ldclrp   x1, x2, [x11]
-// CHECK: ldclrp x1, x2, [x11]                   // encoding: [0x61,0x11,0x22,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldclrp   x21, x22, [sp]
-// CHECK: ldclrp x21, x22, [sp]                  // encoding: [0xf5,0x13,0x36,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldclrpa  x1, x2, [x11]
-// CHECK: ldclrpa x1, x2, [x11]                   // encoding: [0x61,0x11,0xa2,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldclrpa  x21, x22, [sp]
-// CHECK: ldclrpa x21, x22, [sp]                  // encoding: [0xf5,0x13,0xb6,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldclrpal x1, x2, [x11]
-// CHECK: ldclrpal x1, x2, [x11]                   // encoding: [0x61,0x11,0xe2,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldclrpal x21, x22, [sp]
-// CHECK: ldclrpal x21, x22, [sp]                  // encoding: [0xf5,0x13,0xf6,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldclrpl  x1, x2, [x11]
-// CHECK: ldclrpl x1, x2, [x11]                   // encoding: [0x61,0x11,0x62,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldclrpl  x21, x22, [sp]
-// CHECK: ldclrpl x21, x22, [sp]                  // encoding: [0xf5,0x13,0x76,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldclrpl  x22, xzr, [sp]
-// ERROR-INVALID-OP: [[@LINE-1]]:15: error: invalid operand for instruction
-// ERROR-NO-LSE128: error: invalid operand for instruction
-ldclrpl  xzr, x22, [sp]
-// ERROR-INVALID-OP: [[@LINE-1]]:10: error: invalid operand for instruction
-// ERROR-NO-LSE128: error: invalid operand for instruction
-
-ldsetp   x1, x2, [x11]
-// CHECK: ldsetp x1, x2, [x11]                   // encoding: [0x61,0x31,0x22,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldsetp   x21, x22, [sp]
-// CHECK: ldsetp x21, x22, [sp]                  // encoding: [0xf5,0x33,0x36,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldsetpa  x1, x2, [x11]
-// CHECK: ldsetpa x1, x2, [x11]                   // encoding: [0x61,0x31,0xa2,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldsetpa  x21, x22, [sp]
-// CHECK: ldsetpa x21, x22, [sp]                  // encoding: [0xf5,0x33,0xb6,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldsetpal x1, x2, [x11]
-// CHECK: ldsetpal x1, x2, [x11]                   // encoding: [0x61,0x31,0xe2,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldsetpal x21, x22, [sp]
-// CHECK: ldsetpal x21, x22, [sp]                  // encoding: [0xf5,0x33,0xf6,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldsetpl  x1, x2, [x11]
-// CHECK: ldsetpl x1, x2, [x11]                   // encoding: [0x61,0x31,0x62,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldsetpl  x21, x22, [sp]
-// CHECK: ldsetpl x21, x22, [sp]                  // encoding: [0xf5,0x33,0x76,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-ldsetpl  x22, xzr, [sp]
-// ERROR-INVALID-OP: [[@LINE-1]]:15: error: invalid operand for instruction
-// ERROR-NO-LSE128: error: invalid operand for instruction
-ldsetpl  xzr, x22, [sp]
-// ERROR-INVALID-OP: [[@LINE-1]]:10: error: invalid operand for instruction
-// ERROR-NO-LSE128: error: invalid operand for instruction
-
-
-swpp     x1, x2, [x11]
-// CHECK: swpp x1, x2, [x11]                   // encoding: [0x61,0x81,0x22,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-swpp     x21, x22, [sp]
-// CHECK: swpp x21, x22, [sp]                  // encoding: [0xf5,0x83,0x36,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-swppa    x1, x2, [x11]
-// CHECK: swppa x1, x2, [x11]                   // encoding: [0x61,0x81,0xa2,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-swppa    x21, x22, [sp]
-// CHECK: swppa x21, x22, [sp]                  // encoding: [0xf5,0x83,0xb6,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-swppal   x1, x2, [x11]
-// CHECK: swppal x1, x2, [x11]                   // encoding: [0x61,0x81,0xe2,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-swppal   x21, x22, [sp]
-// CHECK: swppal x21, x22, [sp]                  // encoding: [0xf5,0x83,0xf6,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-swppl    x1, x2, [x11]
-// CHECK: swppl x1, x2, [x11]                   // encoding: [0x61,0x81,0x62,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-swppl    x21, x22, [sp]
-// CHECK: swppl x21, x22, [sp]                  // encoding: [0xf5,0x83,0x76,0x19]
-// ERROR-NO-LSE128: [[@LINE-2]]:1: error: instruction requires: lse128
-swppl    x22, xzr, [sp]
-// ERROR-INVALID-OP: [[@LINE-1]]:15: error: invalid operand for instruction
-// ERROR-NO-LSE128: error: invalid operand for instruction
-swppl    xzr, x22, [sp]
-// ERROR-INVALID-OP: [[@LINE-1]]:10: error: invalid operand for instruction
-// ERROR-NO-LSE128: error: invalid operand for instruction
-
diff --git a/llvm/test/MC/AArch64/armv9.4a-chk.s b/llvm/test/MC/AArch64/armv9.4a-chk.s
index 95acee3..14b0c37 100644
--- a/llvm/test/MC/AArch64/armv9.4a-chk.s
+++ b/llvm/test/MC/AArch64/armv9.4a-chk.s
@@ -1,21 +1,38 @@
-// RUN: llvm-mc -triple aarch64 -mattr=+chk -show-encoding %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64 -mattr=+v8.9a -show-encoding %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64 -mattr=+v9.4a -show-encoding %s | FileCheck %s
-// RUN: llvm-mc -triple aarch64 -mattr=+v8a -show-encoding %s | FileCheck %s --check-prefix=NO-CHK
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v8.9a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+v9.4a < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+chk < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+chk < %s \
+// RUN:        | llvm-objdump -d --mattr=+chk - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+chk < %s \
+// RUN:   | llvm-objdump -d --mattr=-chk - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+chk < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+chk -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
 
 // FEAT_CHK is mandatory from v8.0-a, but a clang user may not be using the LLVM
 // integrated assembler, so we cannot just print `chkfeat x16` in all
 // circumstances. Thankfully, we can always print `hint #40` when we cannot
 // print `chkfeat x16`.
-//
 // So, in this case, we only print `chkfeat x16` from v8.9-a onwards, as an
 // assembler that understands v8.9-a will understand `chkfeat x16`, and those
 // that understand previous versions may not.
 
 chkfeat x16
-// CHECK: chkfeat x16                       // encoding: [0x1f,0x25,0x03,0xd5]
-// NO-CHK: hint #40                              // encoding: [0x1f,0x25,0x03,0xd5]
+// CHECK-INST: chkfeat x16
+// CHECK-ENCODING: encoding: [0x1f,0x25,0x03,0xd5]
+// CHECK-ERROR: hint #40
+// CHECK-UNKNOWN:  d503251f      hint #40
 
 hint #40
-// CHECK: chkfeat x16                      // encoding: [0x1f,0x25,0x03,0xd5]
-// NO-CHK: hint #40                             // encoding: [0x1f,0x25,0x03,0xd5]
+// CHECK-INST: chkfeat x16
+// CHECK-ENCODING: encoding: [0x1f,0x25,0x03,0xd5]
+// CHECK-ERROR: hint #40
+// CHECK-UNKNOWN:  d503251f      hint #40
diff --git a/llvm/test/MC/AArch64/armv9.4a-ebep.s b/llvm/test/MC/AArch64/armv9.4a-ebep.s
index 7e9f111..2c7c714 100644
--- a/llvm/test/MC/AArch64/armv9.4a-ebep.s
+++ b/llvm/test/MC/AArch64/armv9.4a-ebep.s
@@ -1,9 +1,41 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
 
 mrs x2, PM
+// CHECK-INST: mrs x2, PM
+// CHECK-ENCODING: encoding: [0x22,0x43,0x38,0xd5]
+// CHECK-UNKNOWN:  d5384322 mrs x2, PM
+
+mrs x3, PM
+// CHECK-INST: mrs x3, PM
+// CHECK-ENCODING: encoding: [0x23,0x43,0x38,0xd5]
+// CHECK-UNKNOWN:  d5384323 mrs x3, PM
+
 msr PM, x3
+// CHECK-INST: msr PM, x3
+// CHECK-ENCODING: encoding: [0x23,0x43,0x18,0xd5]
+// CHECK-UNKNOWN:  d5184323 msr PM, x3
+
+msr PM, x6
+// CHECK-INST: msr PM, x6
+// CHECK-ENCODING: encoding: [0x26,0x43,0x18,0xd5]
+// CHECK-UNKNOWN:  d5184326 msr PM, x6
+
+msr PM, #0
+// CHECK-INST: msr PM, #0
+// CHECK-ENCODING: encoding: [0x1f,0x42,0x01,0xd5]
+// CHECK-UNKNOWN:  d501421f msr PM, #0
+
 msr PM, #1
+// CHECK-INST: msr PM, #1
+// CHECK-ENCODING: encoding: [0x1f,0x43,0x01,0xd5]
+// CHECK-UNKNOWN:  d501431f msr PM, #1
 
-// CHECK:       mrs x2, {{pm|PM}} // encoding: [0x22,0x43,0x38,0xd5]
-// CHECK:       msr {{pm|PM}}, x3 // encoding: [0x23,0x43,0x18,0xd5]
-// CHECK:       msr {{pm|PM}}, #1 // encoding: [0x1f,0x43,0x01,0xd5]
diff --git a/llvm/test/MC/AArch64/armv9.4a-gcs.s b/llvm/test/MC/AArch64/armv9.4a-gcs.s
index b4af9b5..f702c94 100644
--- a/llvm/test/MC/AArch64/armv9.4a-gcs.s
+++ b/llvm/test/MC/AArch64/armv9.4a-gcs.s
@@ -1,115 +1,204 @@
-// RUN: llvm-mc -triple aarch64 -mattr +gcs -show-encoding %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64 -show-encoding %s 2>%t | FileCheck %s --check-prefix=NO-GCS
-// RUN: FileCheck --check-prefix=ERROR-NO-GCS %s < %t
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+gcs < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+gcs < %s \
+// RUN:        | llvm-objdump -d --mattr=+gcs --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+gcs < %s \
+// RUN:        | llvm-objdump -d --mattr=-gcs --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+gcs < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+gcs -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
 
 msr GCSCR_EL1, x0
+// CHECK-INST: msr GCSCR_EL1, x0
+// CHECK-ENCODING: encoding: [0x00,0x25,0x18,0xd5]
+// CHECK-UNKNOWN:  d5182500 msr GCSCR_EL1, x0
+
 mrs x1, GCSCR_EL1
-// CHECK: msr     GCSCR_EL1, x0                   // encoding: [0x00,0x25,0x18,0xd5]
-// CHECK: mrs     x1, GCSCR_EL1                   // encoding: [0x01,0x25,0x38,0xd5]
+// CHECK-INST: mrs x1, GCSCR_EL1
+// CHECK-ENCODING: encoding: [0x01,0x25,0x38,0xd5]
+// CHECK-UNKNOWN:  d5382501 mrs x1, GCSCR_EL1
 
 msr GCSPR_EL1, x2
+// CHECK-INST: msr GCSPR_EL1, x2
+// CHECK-ENCODING: encoding: [0x22,0x25,0x18,0xd5]
+// CHECK-UNKNOWN:  d5182522 msr GCSPR_EL1, x2
+
 mrs x3, GCSPR_EL1
-// CHECK: msr     GCSPR_EL1, x2                   // encoding: [0x22,0x25,0x18,0xd5]
-// CHECK: mrs     x3, GCSPR_EL1                   // encoding: [0x23,0x25,0x38,0xd5]
+// CHECK-INST: mrs x3, GCSPR_EL1
+// CHECK-ENCODING: encoding: [0x23,0x25,0x38,0xd5]
+// CHECK-UNKNOWN:  d5382523 mrs x3, GCSPR_EL1
 
 msr GCSCRE0_EL1, x4
+// CHECK-INST: msr GCSCRE0_EL1, x4
+// CHECK-ENCODING: encoding: [0x44,0x25,0x18,0xd5]
+// CHECK-UNKNOWN:  d5182544 msr GCSCRE0_EL1, x4
+
 mrs x5, GCSCRE0_EL1
-// CHECK: msr     GCSCRE0_EL1, x4                 // encoding: [0x44,0x25,0x18,0xd5]
-// CHECK: mrs     x5, GCSCRE0_EL1                 // encoding: [0x45,0x25,0x38,0xd5]
+// CHECK-INST: mrs x5, GCSCRE0_EL1
+// CHECK-ENCODING: encoding: [0x45,0x25,0x38,0xd5]
+// CHECK-UNKNOWN:  d5382545 mrs x5, GCSCRE0_EL1
 
 msr GCSPR_EL0, x6
+// CHECK-INST: msr GCSPR_EL0, x6
+// CHECK-ENCODING: encoding: [0x26,0x25,0x1b,0xd5]
+// CHECK-UNKNOWN:  d51b2526 msr GCSPR_EL0, x6
+
 mrs x7, GCSPR_EL0
-// CHECK: msr     GCSPR_EL0, x6                   // encoding: [0x26,0x25,0x1b,0xd5]
-// CHECK: mrs     x7, GCSPR_EL0                   // encoding: [0x27,0x25,0x3b,0xd5]
+// CHECK-INST: mrs x7, GCSPR_EL0
+// CHECK-ENCODING: encoding: [0x27,0x25,0x3b,0xd5]
+// CHECK-UNKNOWN:  d53b2527 mrs x7, GCSPR_EL0
 
 msr GCSCR_EL2, x10
+// CHECK-INST: msr GCSCR_EL2, x10
+// CHECK-ENCODING: encoding: [0x0a,0x25,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c250a msr GCSCR_EL2, x10
+
 mrs x11, GCSCR_EL2
-// CHECK: msr     GCSCR_EL2, x10                  // encoding: [0x0a,0x25,0x1c,0xd5]
-// CHECK: mrs     x11, GCSCR_EL2                  // encoding: [0x0b,0x25,0x3c,0xd5]
+// CHECK-INST: mrs x11, GCSCR_EL2
+// CHECK-ENCODING: encoding: [0x0b,0x25,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c250b mrs x11, GCSCR_EL2
 
 msr GCSPR_EL2, x12
+// CHECK-INST: msr GCSPR_EL2, x12
+// CHECK-ENCODING: encoding: [0x2c,0x25,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c252c msr GCSPR_EL2, x12
+
 mrs x13, GCSPR_EL2
-// CHECK: msr     GCSPR_EL2, x12                  // encoding: [0x2c,0x25,0x1c,0xd5]
-// CHECK: mrs     x13, GCSPR_EL2                  // encoding: [0x2d,0x25,0x3c,0xd5]
+// CHECK-INST: mrs x13, GCSPR_EL2
+// CHECK-ENCODING: encoding: [0x2d,0x25,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c252d mrs x13, GCSPR_EL2
 
 msr GCSCR_EL12, x14
+// CHECK-INST: msr GCSCR_EL12, x14
+// CHECK-ENCODING: encoding: [0x0e,0x25,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d250e msr GCSCR_EL12, x14
+
 mrs x15, GCSCR_EL12
-// CHECK: msr     GCSCR_EL12, x14                 // encoding: [0x0e,0x25,0x1d,0xd5]
-// CHECK: mrs     x15, GCSCR_EL12                 // encoding: [0x0f,0x25,0x3d,0xd5]
+// CHECK-INST: mrs x15, GCSCR_EL12
+// CHECK-ENCODING: encoding: [0x0f,0x25,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d250f mrs x15, GCSCR_EL12
 
 msr GCSPR_EL12, x16
+// CHECK-INST: msr GCSPR_EL12, x16
+// CHECK-ENCODING: encoding: [0x30,0x25,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d2530 msr GCSPR_EL12, x16
+
 mrs x17, GCSPR_EL12
-// CHECK: msr     GCSPR_EL12, x16                 // encoding: [0x30,0x25,0x1d,0xd5]
-// CHECK: mrs     x17, GCSPR_EL12                 // encoding: [0x31,0x25,0x3d,0xd5]
+// CHECK-INST: mrs x17, GCSPR_EL12
+// CHECK-ENCODING: encoding: [0x31,0x25,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d2531 mrs x17, GCSPR_EL12
 
 msr GCSCR_EL3, x18
+// CHECK-INST: msr GCSCR_EL3, x18
+// CHECK-ENCODING: encoding: [0x12,0x25,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51e2512 msr GCSCR_EL3, x18
+
 mrs x19, GCSCR_EL3
-// CHECK: msr     GCSCR_EL3, x18                  // encoding: [0x12,0x25,0x1e,0xd5]
-// CHECK: mrs     x19, GCSCR_EL3                  // encoding: [0x13,0x25,0x3e,0xd5]
+// CHECK-INST: mrs x19, GCSCR_EL3
+// CHECK-ENCODING: encoding: [0x13,0x25,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53e2513 mrs x19, GCSCR_EL3
 
 msr GCSPR_EL3, x20
+// CHECK-INST: msr GCSPR_EL3, x20
+// CHECK-ENCODING: encoding: [0x34,0x25,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51e2534 msr GCSPR_EL3, x20
+
 mrs x21, GCSPR_EL3
-// CHECK: msr     GCSPR_EL3, x20                  // encoding: [0x34,0x25,0x1e,0xd5]
-// CHECK: mrs     x21, GCSPR_EL3                  // encoding: [0x35,0x25,0x3e,0xd5]
+// CHECK-INST: mrs x21, GCSPR_EL3
+// CHECK-ENCODING: encoding: [0x35,0x25,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53e2535 mrs x21, GCSPR_EL3
 
 gcsss1 x21
-// CHECK: gcsss1  x21                        // encoding: [0x55,0x77,0x0b,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcsss1 x21
+// CHECK-ENCODING: encoding: [0x55,0x77,0x0b,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d50b7755 sys #3, c7, c7, #2, x21
 
 gcsss2 x22
-// CHECK: gcsss2  x22                        // encoding: [0x76,0x77,0x2b,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcsss2 x22
+// CHECK-ENCODING: encoding: [0x76,0x77,0x2b,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d52b7776 sysl x22, #3, c7, c7, #3
 
 gcspushm x25
-// CHECK: gcspushm x25                       // encoding: [0x19,0x77,0x0b,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcspushm x25
+// CHECK-ENCODING: encoding: [0x19,0x77,0x0b,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d50b7719 sys #3, c7, c7, #0, x25
 
 gcspopm
-// CHECK: gcspopm                             // encoding: [0x3f,0x77,0x2b,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcspopm
+// CHECK-ENCODING: encoding: [0x3f,0x77,0x2b,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d52b773f sysl xzr, #3, c7, c7, #1
 
 gcspopm xzr
-// CHECK: gcspopm                            // encoding: [0x3f,0x77,0x2b,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcspopm
+// CHECK-ENCODING: encoding: [0x3f,0x77,0x2b,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d52b773f sysl xzr, #3, c7, c7, #1
 
 gcspopm x25
-// CHECK: gcspopm  x25                        // encoding: [0x39,0x77,0x2b,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
-
-gcsb dsync
-// CHECK: gcsb    dsync                           // encoding: [0x7f,0x22,0x03,0xd5]
-// ERROR-NO-GCS-NOT: [[@LINE-2]]:1: error: instruction requires: gcs
-// NO-GCS: hint #19                              // encoding: [0x7f,0x22,0x03,0xd5]
-
-hint #19
-// CHECK: gcsb    dsync                           // encoding: [0x7f,0x22,0x03,0xd5]
-// ERROR-NO-GCS-NOT: [[@LINE-2]]:1: error: instruction requires: gcs
-// NO-GCS: hint #19                              // encoding: [0x7f,0x22,0x03,0xd5]
+// CHECK-INST: gcspopm x25
+// CHECK-ENCODING: encoding: [0x39,0x77,0x2b,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d52b7739 sysl x25, #3, c7, c7, #1
 
 gcsstr x26, [x27]
-// CHECK: gcsstr x26, [x27]                        // encoding: [0x7a,0x0f,0x1f,0xd9]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcsstr x26, [x27]
+// CHECK-ENCODING: encoding: [0x7a,0x0f,0x1f,0xd9]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d91f0f7a <unknown>
 
 gcsstr x26, [sp]
-// CHECK: gcsstr x26, [sp]                         // encoding: [0xfa,0x0f,0x1f,0xd9]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcsstr x26, [sp]
+// CHECK-ENCODING: encoding: [0xfa,0x0f,0x1f,0xd9]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d91f0ffa <unknown>
 
 gcssttr x26, [x27]
-// CHECK: gcssttr x26, [x27]                       // encoding: [0x7a,0x1f,0x1f,0xd9]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcssttr x26, [x27]
+// CHECK-ENCODING: encoding: [0x7a,0x1f,0x1f,0xd9]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d91f1f7a <unknown>
 
 gcssttr x26, [sp]
-// CHECK: gcssttr x26, [sp]                        // encoding: [0xfa,0x1f,0x1f,0xd9]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcssttr x26, [sp]
+// CHECK-ENCODING: encoding: [0xfa,0x1f,0x1f,0xd9]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d91f1ffa <unknown>
 
 gcspushx
-// CHECK: gcspushx                          // encoding: [0x9f,0x77,0x08,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcspushx
+// CHECK-ENCODING: encoding: [0x9f,0x77,0x08,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d508779f sys #0, c7, c7, #4
 
 gcspopcx
-// CHECK: gcspopcx                          // encoding: [0xbf,0x77,0x08,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcspopcx
+// CHECK-ENCODING: encoding: [0xbf,0x77,0x08,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d50877bf sys #0, c7, c7, #5
 
 gcspopx
-// CHECK: gcspopx                           // encoding: [0xdf,0x77,0x08,0xd5]
-// ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs
+// CHECK-INST: gcspopx
+// CHECK-ENCODING: encoding: [0xdf,0x77,0x08,0xd5]
+// CHECK-ERROR: error: instruction requires: gcs
+// CHECK-UNKNOWN:  d50877df sys #0, c7, c7, #6
+
+gcsb dsync
+// CHECK-INST: gcsb dsync
+// CHECK-ENCODING: encoding: [0x7f,0x22,0x03,0xd5]
+// CHECK-UNKNOWN:  d503227f hint #19
+// CHECK-ERROR: hint #19                              // encoding: [0x7f,0x22,0x03,0xd5]
+
+hint #19
+// CHECK-INST: gcsb dsync
+// CHECK-ENCODING: encoding: [0x7f,0x22,0x03,0xd5]
+// CHECK-UNKNOWN:  d503227f hint #19
+// CHECK-ERROR: hint #19                              // encoding: [0x7f,0x22,0x03,0xd5]
diff --git a/llvm/test/MC/AArch64/armv9.4a-lse128-diagnostics.s b/llvm/test/MC/AArch64/armv9.4a-lse128-diagnostics.s
new file mode 100644
index 0000000..059b18f
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.4a-lse128-diagnostics.s
@@ -0,0 +1,20 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+
+ldclrpl  x22, xzr, [sp]
+// CHECK-ERROR: error: invalid operand for instruction
+
+ldclrpl  xzr, x22, [sp]
+// CHECK-ERROR: error: invalid operand for instruction
+
+ldsetpl  x22, xzr, [sp]
+// CHECK-ERROR: error: invalid operand for instruction
+
+ldsetpl  xzr, x22, [sp]
+// CHECK-ERROR: error: invalid operand for instruction
+
+swppl    x22, xzr, [sp]
+// CHECK-ERROR: error: invalid operand for instruction
+
+swppl    xzr, x22, [sp]
+// CHECK-ERROR: error: invalid operand for instruction
diff --git a/llvm/test/MC/AArch64/armv9.4a-lse128.s b/llvm/test/MC/AArch64/armv9.4a-lse128.s
new file mode 100644
index 0000000..25dcb04
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.4a-lse128.s
@@ -0,0 +1,159 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lse128 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lse128 < %s \
+// RUN:        | llvm-objdump -d --mattr=+lse128 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lse128 < %s \
+// RUN:        | llvm-objdump -d --mattr=-lse128 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lse128 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+lse128 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
+
+ldclrp   x1, x2, [x11]
+// CHECK-INST: ldclrp x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x11,0x22,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19221161 <unknown>
+
+ldclrp   x21, x22, [sp]
+// CHECK-INST: ldclrp x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x13,0x36,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  193613f5 <unknown>
+
+ldclrpa  x1, x2, [x11]
+// CHECK-INST: ldclrpa x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x11,0xa2,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19a21161 <unknown>
+
+ldclrpa  x21, x22, [sp]
+// CHECK-INST: ldclrpa x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x13,0xb6,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19b613f5 <unknown>
+
+ldclrpal x1, x2, [x11]
+// CHECK-INST: ldclrpal x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x11,0xe2,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19e21161 <unknown>
+
+ldclrpal x21, x22, [sp]
+// CHECK-INST: ldclrpal x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x13,0xf6,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19f613f5 <unknown>
+
+ldclrpl  x1, x2, [x11]
+// CHECK-INST: ldclrpl x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x11,0x62,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19621161 <unknown>
+
+ldclrpl  x21, x22, [sp]
+// CHECK-INST: ldclrpl x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x13,0x76,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  197613f5 <unknown>
+
+ldsetp   x1, x2, [x11]
+// CHECK-INST: ldsetp x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x31,0x22,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19223161 <unknown>
+
+ldsetp   x21, x22, [sp]
+// CHECK-INST: ldsetp x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x33,0x36,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  193633f5 <unknown>
+
+ldsetpa  x1, x2, [x11]
+// CHECK-INST: ldsetpa x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x31,0xa2,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19a23161 <unknown>
+
+ldsetpa  x21, x22, [sp]
+// CHECK-INST: ldsetpa x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x33,0xb6,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19b633f5 <unknown>
+
+ldsetpal x1, x2, [x11]
+// CHECK-INST: ldsetpal x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x31,0xe2,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19e23161 <unknown>
+
+ldsetpal x21, x22, [sp]
+// CHECK-INST: ldsetpal x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x33,0xf6,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19f633f5 <unknown>
+
+ldsetpl  x1, x2, [x11]
+// CHECK-INST: ldsetpl x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x31,0x62,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19623161 <unknown>
+
+ldsetpl  x21, x22, [sp]
+// CHECK-INST: ldsetpl x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x33,0x76,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  197633f5 <unknown>
+
+swpp     x1, x2, [x11]
+// CHECK-INST: swpp x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x81,0x22,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19228161 <unknown>
+
+swpp     x21, x22, [sp]
+// CHECK-INST: swpp x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x83,0x36,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  193683f5 <unknown>
+
+swppa    x1, x2, [x11]
+// CHECK-INST: swppa x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x81,0xa2,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19a28161 <unknown>
+
+swppa    x21, x22, [sp]
+// CHECK-INST: swppa x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x83,0xb6,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19b683f5 <unknown>
+
+swppal   x1, x2, [x11]
+// CHECK-INST: swppal x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x81,0xe2,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19e28161 <unknown>
+
+swppal   x21, x22, [sp]
+// CHECK-INST: swppal x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x83,0xf6,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19f683f5 <unknown>
+
+swppl    x1, x2, [x11]
+// CHECK-INST: swppl x1, x2, [x11]
+// CHECK-ENCODING: encoding: [0x61,0x81,0x62,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  19628161 <unknown>
+
+swppl    x21, x22, [sp]
+// CHECK-INST: swppl x21, x22, [sp]
+// CHECK-ENCODING: encoding: [0xf5,0x83,0x76,0x19]
+// CHECK-ERROR: :[[@LINE-3]]:1: error: instruction requires: lse128
+// CHECK-UNKNOWN:  197683f5 <unknown>
diff --git a/llvm/test/MC/AArch64/armv9.5a-cpa.s b/llvm/test/MC/AArch64/armv9.5a-cpa.s
index 1c338ec..d239224 100644
--- a/llvm/test/MC/AArch64/armv9.5a-cpa.s
+++ b/llvm/test/MC/AArch64/armv9.5a-cpa.s
@@ -1,50 +1,87 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+cpa < %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64 < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-CPA %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+cpa < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+cpa < %s \
+// RUN:        | llvm-objdump -d --mattr=+cpa - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+cpa < %s \
+// RUN:        | llvm-objdump -d --mattr=-cpa - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+cpa < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+cpa -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 addpt x0, x1, x2
-// CHECK: addpt x0, x1, x2               // encoding: [0x20,0x20,0x02,0x9a]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: addpt x0, x1, x2
+// CHECK-ENCODING: encoding: [0x20,0x20,0x02,0x9a]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  9a022020 <unknown>
 
 addpt sp, sp, x2
-// CHECK: addpt sp, sp, x2               // encoding: [0xff,0x23,0x02,0x9a]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: addpt sp, sp, x2
+// CHECK-ENCODING: encoding: [0xff,0x23,0x02,0x9a]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  9a0223ff <unknown>
 
 addpt x0, x1, x2, lsl #0
-// CHECK: addpt x0, x1, x2               // encoding: [0x20,0x20,0x02,0x9a]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: addpt x0, x1, x2
+// CHECK-ENCODING: encoding: [0x20,0x20,0x02,0x9a]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  9a022020 <unknown>
 
 addpt x0, x1, x2, lsl #7
-// CHECK: addpt x0, x1, x2, lsl #7       // encoding: [0x20,0x3c,0x02,0x9a]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: addpt x0, x1, x2, lsl #7
+// CHECK-ENCODING: encoding: [0x20,0x3c,0x02,0x9a]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  9a023c20 <unknown>
 
 addpt sp, sp, x2, lsl #7
-// CHECK: addpt sp, sp, x2, lsl #7       // encoding: [0xff,0x3f,0x02,0x9a]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: addpt sp, sp, x2, lsl #7
+// CHECK-ENCODING: encoding: [0xff,0x3f,0x02,0x9a]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  9a023fff <unknown>
 
 subpt x0, x1, x2
-// CHECK: subpt x0, x1, x2               // encoding: [0x20,0x20,0x02,0xda]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: subpt x0, x1, x2
+// CHECK-ENCODING: encoding: [0x20,0x20,0x02,0xda]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  da022020 <unknown>
 
 subpt sp, sp, x2
-// CHECK: subpt sp, sp, x2               // encoding: [0xff,0x23,0x02,0xda]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: subpt sp, sp, x2
+// CHECK-ENCODING: encoding: [0xff,0x23,0x02,0xda]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  da0223ff <unknown>
 
 subpt x0, x1, x2, lsl #0
-// CHECK: subpt x0, x1, x2               // encoding: [0x20,0x20,0x02,0xda]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: subpt x0, x1, x2
+// CHECK-ENCODING: encoding: [0x20,0x20,0x02,0xda]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  da022020 <unknown>
 
 subpt x0, x1, x2, lsl #7
-// CHECK: subpt x0, x1, x2, lsl #7       // encoding: [0x20,0x3c,0x02,0xda]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: subpt x0, x1, x2, lsl #7
+// CHECK-ENCODING: encoding: [0x20,0x3c,0x02,0xda]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  da023c20 <unknown>
 
 subpt sp, sp, x2, lsl #7
-// CHECK: subpt sp, sp, x2, lsl #7       // encoding: [0xff,0x3f,0x02,0xda]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: subpt sp, sp, x2, lsl #7
+// CHECK-ENCODING: encoding: [0xff,0x3f,0x02,0xda]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  da023fff <unknown>
 
 maddpt x0, x1, x2, x3
-// CHECK: maddpt x0, x1, x2, x3          // encoding: [0x20,0x0c,0x62,0x9b]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: maddpt x0, x1, x2, x3
+// CHECK-ENCODING: encoding: [0x20,0x0c,0x62,0x9b]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  9b620c20 <unknown>
 
 msubpt x0, x1, x2, x3
-// CHECK: msubpt x0, x1, x2, x3          // encoding: [0x20,0x8c,0x62,0x9b]
-// ERROR-NO-CPA: error: instruction requires: cpa
+// CHECK-INST: msubpt x0, x1, x2, x3
+// CHECK-ENCODING: encoding: [0x20,0x8c,0x62,0x9b]
+// CHECK-ERROR: error: instruction requires: cpa
+// CHECK-UNKNOWN:  9b628c20 <unknown>
diff --git a/llvm/test/MC/AArch64/armv9.5a-e3dse.s b/llvm/test/MC/AArch64/armv9.5a-e3dse.s
index b69d49a..9d9798a 100644
--- a/llvm/test/MC/AArch64/armv9.5a-e3dse.s
+++ b/llvm/test/MC/AArch64/armv9.5a-e3dse.s
@@ -1,13 +1,31 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 mrs x0, VDISR_EL3
-// CHECK: mrs x0, VDISR_EL3                  // encoding: [0x20,0xc1,0x3e,0xd5]
+// CHECK-INST: mrs x0, VDISR_EL3
+// CHECK-ENCODING: encoding: [0x20,0xc1,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53ec120 mrs x0, VDISR_EL3
 
 msr VDISR_EL3, x0
-// CHECK: msr VDISR_EL3, x0                  // encoding: [0x20,0xc1,0x1e,0xd5]
+// CHECK-INST: msr VDISR_EL3, x0
+// CHECK-ENCODING: encoding: [0x20,0xc1,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51ec120 msr VDISR_EL3, x0
 
 mrs x0, VSESR_EL3
-// CHECK: mrs x0, VSESR_EL3                  // encoding: [0x60,0x52,0x3e,0xd5]
+// CHECK-INST: mrs x0, VSESR_EL3
+// CHECK-ENCODING: encoding: [0x60,0x52,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53e5260 mrs x0, VSESR_EL3
 
 msr VSESR_EL3, x0
-// CHECK: msr VSESR_EL3, x0                  // encoding: [0x60,0x52,0x1e,0xd5]
+// CHECK-INST: msr VSESR_EL3, x0
+// CHECK-ENCODING: encoding: [0x60,0x52,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51e5260 msr VSESR_EL3, x0
diff --git a/llvm/test/MC/AArch64/armv9.5a-fgwte3.s b/llvm/test/MC/AArch64/armv9.5a-fgwte3.s
index 2352bc7e..6546d51 100644
--- a/llvm/test/MC/AArch64/armv9.5a-fgwte3.s
+++ b/llvm/test/MC/AArch64/armv9.5a-fgwte3.s
@@ -1,6 +1,20 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
 
 mrs x0, FGWTE3_EL3
-// CHECK: mrs x0, FGWTE3_EL3                  // encoding: [0xa0,0x11,0x3e,0xd5]
+// CHECK-INST: mrs x0, FGWTE3_EL3
+// CHECK-ENCODING: encoding: [0xa0,0x11,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53e11a0 mrs x0, FGWTE3_EL3
+
 msr FGWTE3_EL3, x0
-// CHECK: msr FGWTE3_EL3, x0                  // encoding: [0xa0,0x11,0x1e,0xd5]
+// CHECK-INST: msr FGWTE3_EL3, x0
+// CHECK-ENCODING: encoding: [0xa0,0x11,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51e11a0 msr FGWTE3_EL3, x0
diff --git a/llvm/test/MC/AArch64/armv9.5a-hacdbs.s b/llvm/test/MC/AArch64/armv9.5a-hacdbs.s
index 8ccba29..e1d1aaa 100644
--- a/llvm/test/MC/AArch64/armv9.5a-hacdbs.s
+++ b/llvm/test/MC/AArch64/armv9.5a-hacdbs.s
@@ -1,12 +1,31 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
 
 mrs x0, HACDBSBR_EL2
-// CHECK: mrs x0, HACDBSBR_EL2                  // encoding: [0x80,0x23,0x3c,0xd5]
+// CHECK-INST: mrs x0, HACDBSBR_EL2
+// CHECK-ENCODING: encoding: [0x80,0x23,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c2380 mrs x0, HACDBSBR_EL2
+
 msr HACDBSBR_EL2, x0
-// CHECK: msr HACDBSBR_EL2, x0                  // encoding: [0x80,0x23,0x1c,0xd5]
+// CHECK-INST: msr HACDBSBR_EL2, x0
+// CHECK-ENCODING: encoding: [0x80,0x23,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2380 msr HACDBSBR_EL2, x0
 
 mrs x0, HACDBSCONS_EL2
-// CHECK: mrs x0, HACDBSCONS_EL2                  // encoding: [0xa0,0x23,0x3c,0xd5]
+// CHECK-INST: mrs x0, HACDBSCONS_EL2
+// CHECK-ENCODING: encoding: [0xa0,0x23,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c23a0 mrs x0, HACDBSCONS_EL2
+
 msr HACDBSCONS_EL2, x0
-// CHECK: msr HACDBSCONS_EL2, x0                  // encoding: [0xa0,0x23,0x1c,0xd5]
+// CHECK-INST: msr HACDBSCONS_EL2, x0
+// CHECK-ENCODING: encoding: [0xa0,0x23,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c23a0 msr HACDBSCONS_EL2, x0
 
diff --git a/llvm/test/MC/AArch64/armv9.5a-hdbss.s b/llvm/test/MC/AArch64/armv9.5a-hdbss.s
index c4505c9..3e18fe3 100644
--- a/llvm/test/MC/AArch64/armv9.5a-hdbss.s
+++ b/llvm/test/MC/AArch64/armv9.5a-hdbss.s
@@ -1,12 +1,32 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 mrs x0, HDBSSBR_EL2
-// CHECK: mrs x0, HDBSSBR_EL2                  // encoding: [0x40,0x23,0x3c,0xd5]
+// CHECK-INST: mrs x0, HDBSSBR_EL2
+// CHECK-ENCODING: encoding: [0x40,0x23,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c2340 mrs x0, HDBSSBR_EL2
+
 msr HDBSSBR_EL2, x0
-// CHECK: msr HDBSSBR_EL2, x0                  // encoding: [0x40,0x23,0x1c,0xd5]
+// CHECK-INST: msr HDBSSBR_EL2, x0
+// CHECK-ENCODING: encoding: [0x40,0x23,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2340 msr HDBSSBR_EL2, x0
 
 mrs x0, HDBSSPROD_EL2
-// CHECK: mrs x0, HDBSSPROD_EL2                  // encoding: [0x60,0x23,0x3c,0xd5]
+// CHECK-INST: mrs x0, HDBSSPROD_EL2
+// CHECK-ENCODING: encoding: [0x60,0x23,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c2360 mrs x0, HDBSSPROD_EL2
+
 msr HDBSSPROD_EL2, x0
-// CHECK: msr HDBSSPROD_EL2, x0                  // encoding: [0x60,0x23,0x1c,0xd5]
+// CHECK-INST: msr HDBSSPROD_EL2, x0
+// CHECK-ENCODING: encoding: [0x60,0x23,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2360 msr HDBSSPROD_EL2, x0
 
diff --git a/llvm/test/MC/AArch64/armv9.5a-spmu2.s b/llvm/test/MC/AArch64/armv9.5a-spmu2.s
index b7febdb..5177098 100644
--- a/llvm/test/MC/AArch64/armv9.5a-spmu2.s
+++ b/llvm/test/MC/AArch64/armv9.5a-spmu2.s
@@ -1,4 +1,16 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 msr SPMZR_EL0, x0
-// CHECK: msr SPMZR_EL0, x0                  // encoding: [0x80,0x9c,0x13,0xd5]
+// CHECK-INST: msr SPMZR_EL0, x0
+// CHECK-ENCODING: encoding: [0x80,0x9c,0x13,0xd5]
+// CHECK-UNKNOWN:  d5139c80 msr SPMZR_EL0, x0
diff --git a/llvm/test/MC/AArch64/armv9.5a-step2.s b/llvm/test/MC/AArch64/armv9.5a-step2.s
index c5f226b..5d07685 100644
--- a/llvm/test/MC/AArch64/armv9.5a-step2.s
+++ b/llvm/test/MC/AArch64/armv9.5a-step2.s
@@ -1,7 +1,21 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 mrs x0, MDSTEPOP_EL1
-// CHECK: mrs x0, MDSTEPOP_EL1                  // encoding: [0x40,0x05,0x30,0xd5]
+// CHECK-INST: mrs x0, MDSTEPOP_EL1
+// CHECK-ENCODING: encoding: [0x40,0x05,0x30,0xd5]
+// CHECK-UNKNOWN:  d5300540 mrs x0, MDSTEPOP_EL1
 
 msr MDSTEPOP_EL1, x0
-// CHECK: msr MDSTEPOP_EL1, x0                  // encoding: [0x40,0x05,0x10,0xd5]
+// CHECK-INST: msr MDSTEPOP_EL1, x0
+// CHECK-ENCODING: encoding: [0x40,0x05,0x10,0xd5]
+// CHECK-UNKNOWN:  d5100540 msr MDSTEPOP_EL1, x0
diff --git a/llvm/test/MC/AArch64/armv9.5a-tlbiw.s b/llvm/test/MC/AArch64/armv9.5a-tlbiw.s
index 435ed06..efd410c 100644
--- a/llvm/test/MC/AArch64/armv9.5a-tlbiw.s
+++ b/llvm/test/MC/AArch64/armv9.5a-tlbiw.s
@@ -1,27 +1,50 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+tlbiw -mattr=+xs < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-XS %s
-// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+tlbiw < %s 2> %t | FileCheck --check-prefix=CHECK-TLBIW %s && FileCheck --check-prefix=ERROR-NO-XS-TLBIW %s < %t
-// RUN: not llvm-mc -triple aarch64 < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-TLBIW --check-prefix=ERROR-NO-XS-TLBIW %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+tlbiw,+xs < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+tlbiw,+xs < %s \
+// RUN:        | llvm-objdump -d --mattr=+tlbiw,+xs --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+tlbiw,+xs < %s \
+// RUN:        | llvm-objdump -d --mattr=-tlbiw,-xs --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+tlbiw,+xs < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+tlbiw,+xs -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
 
 tlbi VMALLWS2E1
-// CHECK-TLBIW: tlbi vmallws2e1                  // encoding: [0x5f,0x86,0x0c,0xd5]
-// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1 requires: tlbiw
+// CHECK-INST: tlbi vmallws2e1
+// CHECK-ENCODING: encoding: [0x5f,0x86,0x0c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:6: error: TLBI VMALLWS2E1 requires: tlbiw
+// CHECK-UNKNOWN:  d50c865f      sys #4, c8, c6, #2
 
 tlbi VMALLWS2E1IS
-// CHECK-TLBIW: tlbi vmallws2e1is                // encoding: [0x5f,0x82,0x0c,0xd5]
-// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1IS requires: tlbiw
+// CHECK-INST: tlbi vmallws2e1is
+// CHECK-ENCODING: encoding: [0x5f,0x82,0x0c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:6: error: TLBI VMALLWS2E1IS requires: tlbiw
+// CHECK-UNKNOWN:  d50c825f      sys #4, c8, c2, #2
 
 tlbi VMALLWS2E1OS
-// CHECK-TLBIW: tlbi vmallws2e1os                // encoding: [0x5f,0x85,0x0c,0xd5]
-// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1OS requires: tlbiw
+// CHECK-INST: tlbi vmallws2e1os
+// CHECK-ENCODING: encoding: [0x5f,0x85,0x0c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:6: error: TLBI VMALLWS2E1OS requires: tlbiw
+// CHECK-UNKNOWN:  d50c855f      sys #4, c8, c5, #2
 
 tlbi VMALLWS2E1nXS
-// CHECK-XS: tlbi vmallws2e1nxs                  // encoding: [0x5f,0x96,0x0c,0xd5]
-// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1nXS requires: xs, tlbiw
+// CHECK-INST: tlbi vmallws2e1nxs
+// CHECK-ENCODING: encoding: [0x5f,0x96,0x0c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:6: error: TLBI VMALLWS2E1nXS requires: xs, tlbiw
+// CHECK-UNKNOWN:  d50c965f      sys #4, c9, c6, #2
 
 tlbi VMALLWS2E1ISnXS
-// CHECK-XS: tlbi vmallws2e1isnxs                // encoding: [0x5f,0x92,0x0c,0xd5]
-// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1ISnXS requires: xs, tlbiw
+// CHECK-INST: tlbi vmallws2e1isnxs
+// CHECK-ENCODING: encoding: [0x5f,0x92,0x0c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:6: error: TLBI VMALLWS2E1ISnXS requires: xs, tlbiw
+// CHECK-UNKNOWN:  d50c925f      sys #4, c9, c2, #2
 
 tlbi VMALLWS2E1OSnXS
-// CHECK-XS: tlbi vmallws2e1osnxs                // encoding: [0x5f,0x95,0x0c,0xd5]
-// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1OSnXS requires: xs, tlbiw
+// CHECK-INST: tlbi vmallws2e1osnxs
+// CHECK-ENCODING: encoding: [0x5f,0x95,0x0c,0xd5]
+// CHECK-ERROR: :[[@LINE-3]]:6: error: TLBI VMALLWS2E1OSnXS requires: xs, tlbiw
+// CHECK-UNKNOWN:  d50c955f      sys #4, c9, c5, #2
diff --git a/llvm/test/MC/AArch64/armv9.6a-lsui.s b/llvm/test/MC/AArch64/armv9.6a-lsui.s
index dcd2693..63a1889 100644
--- a/llvm/test/MC/AArch64/armv9.6a-lsui.s
+++ b/llvm/test/MC/AArch64/armv9.6a-lsui.s
@@ -1,408 +1,714 @@
-// RUN: llvm-mc -triple aarch64 -mattr=+lsui -show-encoding %s  | FileCheck %s
-// RUN: not llvm-mc -triple aarch64 -show-encoding %s 2>&1  | FileCheck %s --check-prefix=ERROR
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsui < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsui < %s \
+// RUN:        | llvm-objdump -d --mattr=+lsui --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+lsui < %s \
+// RUN:        | llvm-objdump -d --mattr=-lsui --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+lsui < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+lsui -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
-_func:
-// CHECK: _func:
 //------------------------------------------------------------------------------
 // Unprivileged load/store operations
 //------------------------------------------------------------------------------
-  ldtxr       x9, [sp]
-// CHECK: ldtxr	x9, [sp]                        // encoding: [0xe9,0x7f,0x5f,0xc9]
-// ERROR: error: instruction requires: lsui
-  ldtxr       x9, [sp, #0]
-// CHECK: ldtxr	x9, [sp]                        // encoding: [0xe9,0x7f,0x5f,0xc9]
-// ERROR: error: instruction requires: lsui
-  ldtxr       x10, [x11]
-// CHECK: ldtxr	x10, [x11]                      // encoding: [0x6a,0x7d,0x5f,0xc9]
-// ERROR: error: instruction requires: lsui
-  ldtxr       x10, [x11, #0]
-// CHECK: ldtxr	x10, [x11]                      // encoding: [0x6a,0x7d,0x5f,0xc9]
-// ERROR: error: instruction requires: lsui
-
-  ldatxr      x9, [sp]
-// CHECK: ldatxr	x9, [sp]                        // encoding: [0xe9,0xff,0x5f,0xc9]
-// ERROR: error: instruction requires: lsui
-  ldatxr      x10, [x11]
-// CHECK: ldatxr	x10, [x11]                      // encoding: [0x6a,0xfd,0x5f,0xc9]
-// ERROR: error: instruction requires: lsui
-
-  sttxr       wzr, w4, [sp]
-// CHECK: sttxr	wzr, w4, [sp]                   // encoding: [0xe4,0x7f,0x1f,0x89]
-// ERROR: error: instruction requires: lsui
-  sttxr       wzr, w4, [sp, #0]
-// CHECK: sttxr	wzr, w4, [sp]                   // encoding: [0xe4,0x7f,0x1f,0x89]
-// ERROR: error: instruction requires: lsui
-  sttxr       w5, x6, [x7]
-// CHECK: sttxr	w5, x6, [x7]                    // encoding: [0xe6,0x7c,0x05,0xc9]
-// ERROR: error: instruction requires: lsui
-  sttxr       w5, x6, [x7, #0]
-// CHECK: sttxr	w5, x6, [x7]                    // encoding: [0xe6,0x7c,0x05,0xc9]
-// ERROR: error: instruction requires: lsui
-
-  stltxr      w2, w4, [sp]
-// CHECK: stltxr	w2, w4, [sp]                    // encoding: [0xe4,0xff,0x02,0x89]
-// ERROR: error: instruction requires: lsui
-  stltxr      w5, x6, [x7]
-// CHECK: stltxr	w5, x6, [x7]                    // encoding: [0xe6,0xfc,0x05,0xc9]
-// ERROR: error: instruction requires: lsui
+ldtxr x9, [sp]
+// CHECK-INST: ldtxr x9, [sp]
+// CHECK-ENCODING: encoding: [0xe9,0x7f,0x5f,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c95f7fe9 <unknown>
+
+ldtxr x9, [sp, #0]
+// CHECK-INST: ldtxr x9, [sp]
+// CHECK-ENCODING: encoding: [0xe9,0x7f,0x5f,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c95f7fe9 <unknown>
+
+ldtxr x10, [x11]
+// CHECK-INST: ldtxr x10, [x11]
+// CHECK-ENCODING: encoding: [0x6a,0x7d,0x5f,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c95f7d6a <unknown>
+
+ldtxr x10, [x11, #0]
+// CHECK-INST: ldtxr x10, [x11]
+// CHECK-ENCODING: encoding: [0x6a,0x7d,0x5f,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c95f7d6a <unknown>
+
+ldatxr x9, [sp]
+// CHECK-INST: ldatxr x9, [sp]
+// CHECK-ENCODING: encoding: [0xe9,0xff,0x5f,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c95fffe9 <unknown>
+
+ldatxr x10, [x11]
+// CHECK-INST: ldatxr x10, [x11]
+// CHECK-ENCODING: encoding: [0x6a,0xfd,0x5f,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c95ffd6a <unknown>
+
+sttxr wzr, w4, [sp]
+// CHECK-INST: sttxr wzr, w4, [sp]
+// CHECK-ENCODING: encoding: [0xe4,0x7f,0x1f,0x89]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  891f7fe4 <unknown>
+
+sttxr wzr, w4, [sp, #0]
+// CHECK-INST: sttxr wzr, w4, [sp]
+// CHECK-ENCODING: encoding: [0xe4,0x7f,0x1f,0x89]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  891f7fe4 <unknown>
+
+sttxr w5, x6, [x7]
+// CHECK-INST: sttxr w5, x6, [x7]
+// CHECK-ENCODING: encoding: [0xe6,0x7c,0x05,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c9057ce6 <unknown>
+
+sttxr w5, x6, [x7, #0]
+// CHECK-INST: sttxr w5, x6, [x7]
+// CHECK-ENCODING: encoding: [0xe6,0x7c,0x05,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c9057ce6 <unknown>
+
+stltxr w2, w4, [sp]
+// CHECK-INST: stltxr w2, w4, [sp]
+// CHECK-ENCODING: encoding: [0xe4,0xff,0x02,0x89]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  8902ffe4 <unknown>
+
+stltxr w5, x6, [x7]
+// CHECK-INST: stltxr w5, x6, [x7]
+// CHECK-ENCODING: encoding: [0xe6,0xfc,0x05,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c905fce6 <unknown>
 
 //------------------------------------------------------------------------------
 // Unprivileged load/store register pair (offset)
 //------------------------------------------------------------------------------
 
-  ldtp       x21, x29, [x2, #504]
-// CHECK: ldtp	x21, x29, [x2, #504]            // encoding: [0x55,0xf4,0x5f,0xe9]
-// ERROR: instruction requires: lsui
-  ldtp       x22, x23, [x3, #-512]
-// CHECK: ldtp	x22, x23, [x3, #-512]           // encoding: [0x76,0x5c,0x60,0xe9]
-// ERROR: instruction requires: lsui
-  ldtp       x24, x25, [x4, #8]
-// CHECK: ldtp	x24, x25, [x4, #8]              // encoding: [0x98,0xe4,0x40,0xe9]
-// ERROR: instruction requires: lsui
-
-  sttp       x3, x5, [sp], #16
-// CHECK: sttp	x3, x5, [sp], #16               // encoding: [0xe3,0x17,0x81,0xe8]
-// ERROR: instruction requires: lsui
-  sttp       x3, x5, [sp, #8]!
-// CHECK: sttp	x3, x5, [sp, #8]!               // encoding: [0xe3,0x97,0x80,0xe9]
-// ERROR: instruction requires: lsui
-
-  sttp       q3, q5, [sp]
-// CHECK: sttp	q3, q5, [sp]                    // encoding: [0xe3,0x17,0x00,0xed]
-// ERROR: instruction requires: lsui
-  sttp       q17, q19, [sp, #1008]
-// CHECK: sttp	q17, q19, [sp, #1008]           // encoding: [0xf1,0xcf,0x1f,0xed]
-// ERROR: instruction requires: lsui
+ldtp x21, x29, [x2, #504]
+// CHECK-INST: ldtp x21, x29, [x2, #504]
+// CHECK-ENCODING: encoding: [0x55,0xf4,0x5f,0xe9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e95ff455 <unknown>
+
+ldtp x22, x23, [x3, #-512]
+// CHECK-INST: ldtp x22, x23, [x3, #-512]
+// CHECK-ENCODING: encoding: [0x76,0x5c,0x60,0xe9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e9605c76 <unknown>
+
+ldtp x24, x25, [x4, #8]
+// CHECK-INST: ldtp x24, x25, [x4, #8]
+// CHECK-ENCODING: encoding: [0x98,0xe4,0x40,0xe9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e940e498 <unknown>
+
+sttp x3, x5, [sp], #16
+// CHECK-INST: sttp x3, x5, [sp], #16
+// CHECK-ENCODING: encoding: [0xe3,0x17,0x81,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e88117e3 <unknown>
+
+sttp x3, x5, [sp, #8]!
+// CHECK-INST: sttp x3, x5, [sp, #8]!
+// CHECK-ENCODING: encoding: [0xe3,0x97,0x80,0xe9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e98097e3 <unknown>
+
+sttp q3, q5, [sp]
+// CHECK-INST: sttp q3, q5, [sp]
+// CHECK-ENCODING: encoding: [0xe3,0x17,0x00,0xed]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ed0017e3 <unknown>
+
+sttp q17, q19, [sp, #1008]
+// CHECK-INST: sttp q17, q19, [sp, #1008]
+// CHECK-ENCODING: encoding: [0xf1,0xcf,0x1f,0xed]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ed1fcff1 <unknown>
 
 //------------------------------------------------------------------------------
 // Load/store register pair (post-indexed)
 //------------------------------------------------------------------------------
 
-  ldtp       x21, x29, [x2], #504
-// CHECK: ldtp	x21, x29, [x2], #504            // encoding: [0x55,0xf4,0xdf,0xe8]
-// ERROR: instruction requires: lsui
-  ldtp       x22, x23, [x3], #-512
-// CHECK: ldtp	x22, x23, [x3], #-512           // encoding: [0x76,0x5c,0xe0,0xe8]
-// ERROR: instruction requires: lsui
-  ldtp       x24, x25, [x4], #8
-// CHECK: ldtp	x24, x25, [x4], #8              // encoding: [0x98,0xe4,0xc0,0xe8]
-// ERROR: instruction requires: lsui
-
-  sttp       q3, q5, [sp], #0
-// CHECK: sttp	q3, q5, [sp], #0                // encoding: [0xe3,0x17,0x80,0xec]
-// ERROR: instruction requires: lsui
-  sttp       q17, q19, [sp], #1008
-// CHECK: sttp	q17, q19, [sp], #1008           // encoding: [0xf1,0xcf,0x9f,0xec]
-// ERROR: instruction requires: lsui
-  ldtp       q23, q29, [x1], #-1024
-// CHECK: ldtp	q23, q29, [x1], #-1024          // encoding: [0x37,0x74,0xe0,0xec]
-// ERROR: instruction requires: lsui
+ldtp x21, x29, [x2], #504
+// CHECK-INST: ldtp x21, x29, [x2], #504
+// CHECK-ENCODING: encoding: [0x55,0xf4,0xdf,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e8dff455 <unknown>
+
+ldtp x22, x23, [x3], #-512
+// CHECK-INST: ldtp x22, x23, [x3], #-512
+// CHECK-ENCODING: encoding: [0x76,0x5c,0xe0,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e8e05c76 <unknown>
+
+ldtp x24, x25, [x4], #8
+// CHECK-INST: ldtp x24, x25, [x4], #8
+// CHECK-ENCODING: encoding: [0x98,0xe4,0xc0,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e8c0e498 <unknown>
+
+sttp q3, q5, [sp], #0
+// CHECK-INST: sttp q3, q5, [sp], #0
+// CHECK-ENCODING: encoding: [0xe3,0x17,0x80,0xec]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ec8017e3 <unknown>
+
+sttp q17, q19, [sp], #1008
+// CHECK-INST: sttp q17, q19, [sp], #1008
+// CHECK-ENCODING: encoding: [0xf1,0xcf,0x9f,0xec]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ec9fcff1 <unknown>
+
+ldtp q23, q29, [x1], #-1024
+// CHECK-INST: ldtp q23, q29, [x1], #-1024
+// CHECK-ENCODING: encoding: [0x37,0x74,0xe0,0xec]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ece07437 <unknown>
 
 //------------------------------------------------------------------------------
 // Load/store register pair (pre-indexed)
 //------------------------------------------------------------------------------
-  ldtp       x21, x29, [x2, #504]!
-// CHECK: ldtp	x21, x29, [x2, #504]!           // encoding: [0x55,0xf4,0xdf,0xe9]
-// ERROR: instruction requires: lsui
-  ldtp       x22, x23, [x3, #-512]!
-// CHECK: ldtp	x22, x23, [x3, #-512]!          // encoding: [0x76,0x5c,0xe0,0xe9]
-// ERROR: instruction requires: lsui
-  ldtp       x24, x25, [x4, #8]!
-// CHECK: ldtp	x24, x25, [x4, #8]!             // encoding: [0x98,0xe4,0xc0,0xe9]
-// ERROR: instruction requires: lsui
-
-  sttp       q3, q5, [sp, #0]!
-// CHECK: sttp	q3, q5, [sp, #0]!               // encoding: [0xe3,0x17,0x80,0xed]
-// ERROR: instruction requires: lsui
-  sttp       q17, q19, [sp, #1008]!
-// CHECK: sttp	q17, q19, [sp, #1008]!          // encoding: [0xf1,0xcf,0x9f,0xed]
-// ERROR: instruction requires: lsui
-  ldtp       q23, q29, [x1, #-1024]!
-// CHECK: ldtp	q23, q29, [x1, #-1024]!         // encoding: [0x37,0x74,0xe0,0xed]
-// ERROR: instruction requires: lsui
+ldtp x21, x29, [x2, #504]!
+// CHECK-INST: ldtp x21, x29, [x2, #504]!
+// CHECK-ENCODING: encoding: [0x55,0xf4,0xdf,0xe9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e9dff455 <unknown>
+
+ldtp x22, x23, [x3, #-512]!
+// CHECK-INST: ldtp x22, x23, [x3, #-512]!
+// CHECK-ENCODING: encoding: [0x76,0x5c,0xe0,0xe9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e9e05c76 <unknown>
+
+ldtp x24, x25, [x4, #8]!
+// CHECK-INST: ldtp x24, x25, [x4, #8]!
+// CHECK-ENCODING: encoding: [0x98,0xe4,0xc0,0xe9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e9c0e498 <unknown>
+
+sttp q3, q5, [sp, #0]!
+// CHECK-INST: sttp q3, q5, [sp, #0]!
+// CHECK-ENCODING: encoding: [0xe3,0x17,0x80,0xed]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ed8017e3 <unknown>
+
+sttp q17, q19, [sp, #1008]!
+// CHECK-INST: sttp q17, q19, [sp, #1008]!
+// CHECK-ENCODING: encoding: [0xf1,0xcf,0x9f,0xed]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ed9fcff1 <unknown>
+
+ldtp q23, q29, [x1, #-1024]!
+// CHECK-INST: ldtp q23, q29, [x1, #-1024]!
+// CHECK-ENCODING: encoding: [0x37,0x74,0xe0,0xed]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ede07437 <unknown>
 
 //------------------------------------------------------------------------------
 // CAS(P)T instructions
 //------------------------------------------------------------------------------
   //64 bits
-  cast       x0, x1, [x2]
-// CHECK: cast	x0, x1, [x2]                    // encoding: [0x41,0x7c,0x80,0xc9]
-// ERROR: instruction requires: lsui
-  cast       x0, x1, [sp, #0]
-// CHECK: cast	x0, x1, [sp]                    // encoding: [0xe1,0x7f,0x80,0xc9]
-// ERROR: instruction requires: lsui
-  casat      x0, x1, [x2]
-// CHECK: casat	x0, x1, [x2]                    // encoding: [0x41,0x7c,0xc0,0xc9]
-// ERROR: instruction requires: lsui
-  casat      x0, x1, [sp, #0]
-// CHECK: casat	x0, x1, [sp]                    // encoding: [0xe1,0x7f,0xc0,0xc9]
-// ERROR: instruction requires: lsui
-  casalt     x0, x1, [x2]
-// CHECK: casalt	x0, x1, [x2]                    // encoding: [0x41,0xfc,0xc0,0xc9]
-// ERROR: instruction requires: lsui
-  casalt     x0, x1, [sp, #0]
-// CHECK: casalt	x0, x1, [sp]                    // encoding: [0xe1,0xff,0xc0,0xc9]
-// ERROR: instruction requires: lsui
-  caslt      x0, x1, [x2]
-// CHECK: caslt	x0, x1, [x2]                    // encoding: [0x41,0xfc,0x80,0xc9]
-// ERROR: instruction requires: lsui
-  caslt      x0, x1, [sp, #0]
-// CHECK: caslt	x0, x1, [sp]                    // encoding: [0xe1,0xff,0x80,0xc9]
-// ERROR: instruction requires: lsui
+  cast x0, x1, [x2]
+// CHECK-INST: cast x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x41,0x7c,0x80,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c9807c41 <unknown>
+
+  cast x0, x1, [sp, #0]
+// CHECK-INST: cast x0, x1, [sp]
+// CHECK-ENCODING: encoding: [0xe1,0x7f,0x80,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c9807fe1 <unknown>
+
+  casat x0, x1, [x2]
+// CHECK-INST: casat x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x41,0x7c,0xc0,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c9c07c41 <unknown>
+
+  casat x0, x1, [sp, #0]
+// CHECK-INST: casat x0, x1, [sp]
+// CHECK-ENCODING: encoding: [0xe1,0x7f,0xc0,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c9c07fe1 <unknown>
+
+  casalt x0, x1, [x2]
+// CHECK-INST: casalt x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x41,0xfc,0xc0,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c9c0fc41 <unknown>
+
+  casalt x0, x1, [sp, #0]
+// CHECK-INST: casalt x0, x1, [sp]
+// CHECK-ENCODING: encoding: [0xe1,0xff,0xc0,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c9c0ffe1 <unknown>
+
+  caslt x0, x1, [x2]
+// CHECK-INST: caslt x0, x1, [x2]
+// CHECK-ENCODING: encoding: [0x41,0xfc,0x80,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c980fc41 <unknown>
+
+  caslt x0, x1, [sp, #0]
+// CHECK-INST: caslt x0, x1, [sp]
+// CHECK-ENCODING: encoding: [0xe1,0xff,0x80,0xc9]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  c980ffe1 <unknown>
 
   //CASP instruction
-  caspt      x0, x1, x2, x3, [x4]
-// CHECK: caspt	x0, x1, x2, x3, [x4]            // encoding: [0x82,0x7c,0x80,0x49]
-// ERROR: instruction requires: lsui
-  caspt      x0, x1, x2, x3, [sp, #0]
-// CHECK: caspt	x0, x1, x2, x3, [sp]            // encoding: [0xe2,0x7f,0x80,0x49]
-// ERROR: instruction requires: lsui
-  caspat     x0, x1, x2, x3, [x4]
-// CHECK: caspat	x0, x1, x2, x3, [x4]            // encoding: [0x82,0x7c,0xc0,0x49]
-// ERROR: instruction requires: lsui
-  caspat     x0, x1, x2, x3, [sp, #0]
-// CHECK: caspat	x0, x1, x2, x3, [sp]            // encoding: [0xe2,0x7f,0xc0,0x49]
-// ERROR: instruction requires: lsui
-  casplt     x0, x1, x2, x3, [x4]
-// CHECK: casplt	x0, x1, x2, x3, [x4]            // encoding: [0x82,0xfc,0x80,0x49]
-// ERROR: instruction requires: lsui
-  casplt     x0, x1, x2, x3, [sp, #0]
-// CHECK: casplt	x0, x1, x2, x3, [sp]            // encoding: [0xe2,0xff,0x80,0x49]
-// ERROR: instruction requires: lsui
-  caspalt    x0, x1, x2, x3, [x4]
-// CHECK: caspalt	x0, x1, x2, x3, [x4]            // encoding: [0x82,0xfc,0xc0,0x49]
-// ERROR: instruction requires: lsui
-  caspalt    x0, x1, x2, x3, [sp, #0]
-// CHECK: caspalt	x0, x1, x2, x3, [sp]            // encoding: [0xe2,0xff,0xc0,0x49]
-// ERROR: instruction requires: lsui
+caspt x0, x1, x2, x3, [x4]
+// CHECK-INST: caspt x0, x1, x2, x3, [x4]
+// CHECK-ENCODING: encoding: [0x82,0x7c,0x80,0x49]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  49807c82 <unknown>
+
+caspt x0, x1, x2, x3, [sp, #0]
+// CHECK-INST: caspt x0, x1, x2, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe2,0x7f,0x80,0x49]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  49807fe2 <unknown>
+
+caspat x0, x1, x2, x3, [x4]
+// CHECK-INST: caspat x0, x1, x2, x3, [x4]
+// CHECK-ENCODING: encoding: [0x82,0x7c,0xc0,0x49]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  49c07c82 <unknown>
+
+caspat x0, x1, x2, x3, [sp, #0]
+// CHECK-INST: caspat x0, x1, x2, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe2,0x7f,0xc0,0x49]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  49c07fe2 <unknown>
+
+casplt x0, x1, x2, x3, [x4]
+// CHECK-INST: casplt x0, x1, x2, x3, [x4]
+// CHECK-ENCODING: encoding: [0x82,0xfc,0x80,0x49]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  4980fc82 <unknown>
+
+casplt x0, x1, x2, x3, [sp, #0]
+// CHECK-INST: casplt x0, x1, x2, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe2,0xff,0x80,0x49]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  4980ffe2 <unknown>
+
+caspalt x0, x1, x2, x3, [x4]
+// CHECK-INST: caspalt x0, x1, x2, x3, [x4]
+// CHECK-ENCODING: encoding: [0x82,0xfc,0xc0,0x49]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  49c0fc82 <unknown>
+
+caspalt x0, x1, x2, x3, [sp, #0]
+// CHECK-INST: caspalt x0, x1, x2, x3, [sp]
+// CHECK-ENCODING: encoding: [0xe2,0xff,0xc0,0x49]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  49c0ffe2 <unknown>
 
 //------------------------------------------------------------------------------
 // SWPT(A|L|AL) instructions
 //------------------------------------------------------------------------------
-  swpt       w7, wzr, [x5]
-// CHECK: swpt	w7, wzr, [x5]                   // encoding: [0xbf,0x84,0x27,0x19]
-// ERROR: instruction requires: lsui
-  swpt       x9, xzr, [sp]
-// CHECK: swpt	x9, xzr, [sp]                   // encoding: [0xff,0x87,0x29,0x59]
-// ERROR: instruction requires: lsui
-
-  swpta      w7, wzr, [x5]
-// CHECK: swpta	w7, wzr, [x5]                   // encoding: [0xbf,0x84,0xa7,0x19]
-// ERROR: instruction requires: lsui
-  swpta      x9, xzr, [sp]
-// CHECK: swpta	x9, xzr, [sp]                   // encoding: [0xff,0x87,0xa9,0x59]
-// ERROR: instruction requires: lsui
-
-  swptl      w7, wzr, [x5]
-// CHECK: swptl	w7, wzr, [x5]                   // encoding: [0xbf,0x84,0x67,0x19]
-// ERROR: instruction requires: lsui
-  swptl      x9, xzr, [sp]
-// CHECK: swptl	x9, xzr, [sp]                   // encoding: [0xff,0x87,0x69,0x59]
-// ERROR: instruction requires: lsui
-
-  swptal     w7, wzr, [x5]
-// CHECK: swptal	w7, wzr, [x5]                   // encoding: [0xbf,0x84,0xe7,0x19]
-// ERROR: instruction requires: lsui
-  swptal     x9, xzr, [sp]
-// CHECK: swptal	x9, xzr, [sp]                   // encoding: [0xff,0x87,0xe9,0x59]
-// ERROR: instruction requires: lsui
+swpt w7, wzr, [x5]
+// CHECK-INST: swpt w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x84,0x27,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  192784bf <unknown>
+
+swpt x9, xzr, [sp]
+// CHECK-INST: swpt x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x87,0x29,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  592987ff <unknown>
+
+swpta w7, wzr, [x5]
+// CHECK-INST: swpta w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x84,0xa7,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  19a784bf <unknown>
+
+swpta x9, xzr, [sp]
+// CHECK-INST: swpta x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x87,0xa9,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  59a987ff <unknown>
+
+swptl w7, wzr, [x5]
+// CHECK-INST: swptl w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x84,0x67,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  196784bf <unknown>
+
+swptl x9, xzr, [sp]
+// CHECK-INST: swptl x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x87,0x69,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  596987ff <unknown>
+
+swptal w7, wzr, [x5]
+// CHECK-INST: swptal w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x84,0xe7,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  19e784bf <unknown>
+
+swptal x9, xzr, [sp]
+// CHECK-INST: swptal x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x87,0xe9,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  59e987ff <unknown>
 
 //------------------------------------------------------------------------------
 // LDT{ADD|CLR|SET}(A|L|AL) instructions
 //------------------------------------------------------------------------------
 
-  ldtadd     w7, wzr, [x5]
-// CHECK: sttadd	w7, [x5]                   // encoding: [0xbf,0x04,0x27,0x19]
-// ERROR: instruction requires: lsui
-  ldtadd     x9, xzr, [sp]
-// CHECK: sttadd	x9, [sp]                   // encoding: [0xff,0x07,0x29,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtadda    w7, wzr, [x5]
-// CHECK: ldtadda	w7, wzr, [x5]                   // encoding: [0xbf,0x04,0xa7,0x19]
-// ERROR: instruction requires: lsui
-  ldtadda    x9, xzr, [sp]
-// CHECK: ldtadda	x9, xzr, [sp]                   // encoding: [0xff,0x07,0xa9,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtaddl    w7, wzr, [x5]
-// CHECK: sttaddl	w7, [x5]                   // encoding: [0xbf,0x04,0x67,0x19]
-// ERROR: instruction requires: lsui
-  ldtaddl    x9, xzr, [sp]
-// CHECK: sttaddl	x9, [sp]                   // encoding: [0xff,0x07,0x69,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtaddal   w7, wzr, [x5]
-// CHECK: ldtaddal	w7, wzr, [x5]                   // encoding: [0xbf,0x04,0xe7,0x19]
-// ERROR: instruction requires: lsui
-  ldtaddal   x9, xzr, [sp]
-// CHECK: ldtaddal	x9, xzr, [sp]                   // encoding: [0xff,0x07,0xe9,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtclr     w7, wzr, [x5]
-// CHECK: sttclr	w7, [x5]                   // encoding: [0xbf,0x14,0x27,0x19]
-// ERROR: instruction requires: lsui
-  ldtclr     x9, xzr, [sp]
-// CHECK: sttclr	x9, [sp]                   // encoding: [0xff,0x17,0x29,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtclrl    w7, wzr, [x5]
-// CHECK: sttclrl	w7, [x5]                   // encoding: [0xbf,0x14,0x67,0x19]
-// ERROR: instruction requires: lsui
-  ldtclrl    x9, xzr, [sp]
-// CHECK: sttclrl	x9, [sp]                   // encoding: [0xff,0x17,0x69,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtclra    w7, wzr, [x5]
-// CHECK: ldtclra	w7, wzr, [x5]                   // encoding: [0xbf,0x14,0xa7,0x19]
-// ERROR: instruction requires: lsui
-  ldtclra    x9, xzr, [sp]
-// CHECK: ldtclra	x9, xzr, [sp]                   // encoding: [0xff,0x17,0xa9,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtclral   w7, wzr, [x5]
-// CHECK: ldtclral	w7, wzr, [x5]                   // encoding: [0xbf,0x14,0xe7,0x19]
-// ERROR: instruction requires: lsui
-  ldtclral   x9, xzr, [sp]
-// CHECK: ldtclral	x9, xzr, [sp]                   // encoding: [0xff,0x17,0xe9,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtset     w7, wzr, [x5]
-// CHECK: sttset	w7, [x5]                   // encoding: [0xbf,0x34,0x27,0x19]
-// ERROR: instruction requires: lsui
-  ldtset     x9, xzr, [sp]
-// CHECK: sttset	x9, [sp]                   // encoding: [0xff,0x37,0x29,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtsetl    w7, wzr, [x5]
-// CHECK: sttsetl	w7, [x5]                   // encoding: [0xbf,0x34,0x67,0x19]
-// ERROR: instruction requires: lsui
-  ldtsetl    x9, xzr, [sp]
-// CHECK: sttsetl	x9, [sp]                   // encoding: [0xff,0x37,0x69,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtseta    w7, wzr, [x5]
-// CHECK: ldtseta	w7, wzr, [x5]                   // encoding: [0xbf,0x34,0xa7,0x19]
-// ERROR: instruction requires: lsui
-  ldtseta    x9, xzr, [sp]
-// CHECK: ldtseta	x9, xzr, [sp]                   // encoding: [0xff,0x37,0xa9,0x59]
-// ERROR: instruction requires: lsui
-
-  ldtsetal   w7, wzr, [x5]
-// CHECK: ldtsetal	w7, wzr, [x5]                   // encoding: [0xbf,0x34,0xe7,0x19]
-// ERROR: instruction requires: lsui
-  ldtsetal   x9, xzr, [sp]
-// CHECK: ldtsetal	x9, xzr, [sp]                   // encoding: [0xff,0x37,0xe9,0x59]
-// ERROR: instruction requires: lsui
+ldtadd w7, wzr, [x5]
+// CHECK-INST: sttadd w7, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x04,0x27,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  192704bf <unknown>
+
+ldtadd x9, xzr, [sp]
+// CHECK-INST: sttadd x9, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x29,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  592907ff <unknown>
+
+ldtadda w7, wzr, [x5]
+// CHECK-INST: ldtadda w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x04,0xa7,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  19a704bf <unknown>
+
+ldtadda x9, xzr, [sp]
+// CHECK-INST: ldtadda x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x07,0xa9,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  59a907ff <unknown>
+
+ldtaddl w7, wzr, [x5]
+// CHECK-INST: sttaddl w7, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x04,0x67,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  196704bf <unknown>
+
+ldtaddl x9, xzr, [sp]
+// CHECK-INST: sttaddl x9, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x69,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  596907ff <unknown>
+
+ldtaddal w7, wzr, [x5]
+// CHECK-INST: ldtaddal w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x04,0xe7,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  19e704bf <unknown>
+
+ldtaddal x9, xzr, [sp]
+// CHECK-INST: ldtaddal x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x07,0xe9,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  59e907ff <unknown>
+
+ldtclr w7, wzr, [x5]
+// CHECK-INST: sttclr w7, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x14,0x27,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  192714bf <unknown>
+
+ldtclr x9, xzr, [sp]
+// CHECK-INST: sttclr x9, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x17,0x29,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  592917ff <unknown>
+
+ldtclrl w7, wzr, [x5]
+// CHECK-INST: sttclrl w7, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x14,0x67,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  196714bf <unknown>
+
+ldtclrl x9, xzr, [sp]
+// CHECK-INST: sttclrl x9, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x17,0x69,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  596917ff <unknown>
+
+ldtclra w7, wzr, [x5]
+// CHECK-INST: ldtclra w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x14,0xa7,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  19a714bf <unknown>
+
+ldtclra x9, xzr, [sp]
+// CHECK-INST: ldtclra x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x17,0xa9,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  59a917ff <unknown>
+
+ldtclral w7, wzr, [x5]
+// CHECK-INST: ldtclral w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x14,0xe7,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  19e714bf <unknown>
+
+ldtclral x9, xzr, [sp]
+// CHECK-INST: ldtclral x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x17,0xe9,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  59e917ff <unknown>
+
+ldtset w7, wzr, [x5]
+// CHECK-INST: sttset w7, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x34,0x27,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  192734bf <unknown>
+
+ldtset x9, xzr, [sp]
+// CHECK-INST: sttset x9, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x37,0x29,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  592937ff <unknown>
+
+ldtsetl w7, wzr, [x5]
+// CHECK-INST: sttsetl w7, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x34,0x67,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  196734bf <unknown>
+
+ldtsetl x9, xzr, [sp]
+// CHECK-INST: sttsetl x9, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x37,0x69,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  596937ff <unknown>
+
+ldtseta w7, wzr, [x5]
+// CHECK-INST: ldtseta w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x34,0xa7,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  19a734bf <unknown>
+
+ldtseta x9, xzr, [sp]
+// CHECK-INST: ldtseta x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x37,0xa9,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  59a937ff <unknown>
+
+ldtsetal w7, wzr, [x5]
+// CHECK-INST: ldtsetal w7, wzr, [x5]
+// CHECK-ENCODING: encoding: [0xbf,0x34,0xe7,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  19e734bf <unknown>
+
+ldtsetal x9, xzr, [sp]
+// CHECK-INST: ldtsetal x9, xzr, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x37,0xe9,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  59e937ff <unknown>
 
 //------------------------------------------------------------------------------
 // ST{ADD|CLR|SET)(A|L|AL)T instructions
 //------------------------------------------------------------------------------
 
-  sttadd     w0, [x2]
-// CHECK: sttadd	w0, [x2]                   // encoding: [0x5f,0x04,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttadd     w2, [sp]
-// CHECK: sttadd	w2, [sp]                   // encoding: [0xff,0x07,0x22,0x19]
-// ERROR: instruction requires: lsui
-  sttadd     x0, [x2]
-// CHECK: sttadd	x0, [x2]                   // encoding: [0x5f,0x04,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttadd     x2, [sp]
-// CHECK: sttadd	x2, [sp]                   // encoding: [0xff,0x07,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttaddl    w0, [x2]
-// CHECK: sttaddl	w0, [x2]                   // encoding: [0x5f,0x04,0x60,0x19]
-// ERROR: instruction requires: lsui
-  sttaddl    w2, [sp]
-// CHECK: sttaddl	w2, [sp]                   // encoding: [0xff,0x07,0x62,0x19]
-// ERROR: instruction requires: lsui
-  sttaddl    x0, [x2]
-// CHECK: sttaddl	x0, [x2]                   // encoding: [0x5f,0x04,0x60,0x59]
-// ERROR: instruction requires: lsui
-  sttaddl    x2, [sp]
-// CHECK: sttaddl	x2, [sp]                   // encoding: [0xff,0x07,0x62,0x59]
-// ERROR: instruction requires: lsui
-
-  sttclr     w0, [x2]
-// CHECK: sttclr	w0, [x2]                   // encoding: [0x5f,0x14,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttclr     w2, [sp]
-// CHECK: sttclr	w2, [sp]                   // encoding: [0xff,0x17,0x22,0x19]
-// ERROR: instruction requires: lsui
-  sttclr     x0, [x2]
-// CHECK: sttclr	x0, [x2]                   // encoding: [0x5f,0x14,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttclr     x2, [sp]
-// CHECK: sttclr	x2, [sp]                   // encoding: [0xff,0x17,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttclrl    w0, [x2]
-// CHECK: sttclrl	w0, [x2]                   // encoding: [0x5f,0x14,0x60,0x19]
-// ERROR: instruction requires: lsui
-  sttclrl    w2, [sp]
-// CHECK: sttclrl	w2, [sp]                   // encoding: [0xff,0x17,0x62,0x19]
-// ERROR: instruction requires: lsui
-  sttclrl    x0, [x2]
-// CHECK: sttclrl	x0, [x2]                   // encoding: [0x5f,0x14,0x60,0x59]
-// ERROR: instruction requires: lsui
-  sttclrl    x2, [sp]
-// CHECK: sttclrl	x2, [sp]                   // encoding: [0xff,0x17,0x62,0x59]
-// ERROR: instruction requires: lsui
-
-  sttset     w0, [x2]
-// CHECK: sttset	w0, [x2]                   // encoding: [0x5f,0x34,0x20,0x19]
-// ERROR: instruction requires: lsui
-  sttset     w2, [sp]
-// CHECK: sttset	w2, [sp]                   // encoding: [0xff,0x37,0x22,0x19]
-// ERROR: instruction requires: lsui
-  sttset     x0, [x2]
-// CHECK: sttset	x0, [x2]                   // encoding: [0x5f,0x34,0x20,0x59]
-// ERROR: instruction requires: lsui
-  sttset     x2, [sp]
-// CHECK: sttset	x2, [sp]                   // encoding: [0xff,0x37,0x22,0x59]
-// ERROR: instruction requires: lsui
-
-  sttsetl    w0, [x2]
-// CHECK: sttsetl	w0, [x2]                   // encoding: [0x5f,0x34,0x60,0x19]
-// ERROR: instruction requires: lsui
-  sttsetl    w2, [sp]
-// CHECK: sttsetl	w2, [sp]                   // encoding: [0xff,0x37,0x62,0x19]
-// ERROR: instruction requires: lsui
-  sttsetl    x0, [x2]
-// CHECK: sttsetl	x0, [x2]                   // encoding: [0x5f,0x34,0x60,0x59]
-// ERROR: instruction requires: lsui
-  sttsetl    x2, [sp]
-// CHECK: sttsetl	x2, [sp]                   // encoding: [0xff,0x37,0x62,0x59]
-// ERROR: instruction requires: lsui
+sttadd w0, [x2]
+// CHECK-INST: sttadd w0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x04,0x20,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  1920045f <unknown>
+
+sttadd w2, [sp]
+// CHECK-INST: sttadd w2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x22,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  192207ff <unknown>
+
+sttadd x0, [x2]
+// CHECK-INST: sttadd x0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x04,0x20,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  5920045f <unknown>
+
+sttadd x2, [sp]
+// CHECK-INST: sttadd x2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x22,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  592207ff <unknown>
+
+sttaddl w0, [x2]
+// CHECK-INST: sttaddl w0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x04,0x60,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  1960045f <unknown>
+
+sttaddl w2, [sp]
+// CHECK-INST: sttaddl w2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x62,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  196207ff <unknown>
+
+sttaddl x0, [x2]
+// CHECK-INST: sttaddl x0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x04,0x60,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  5960045f <unknown>
+
+sttaddl x2, [sp]
+// CHECK-INST: sttaddl x2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x07,0x62,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  596207ff <unknown>
+
+sttclr w0, [x2]
+// CHECK-INST: sttclr w0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x14,0x20,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  1920145f <unknown>
+
+sttclr w2, [sp]
+// CHECK-INST: sttclr w2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x17,0x22,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  192217ff <unknown>
+
+sttclr x0, [x2]
+// CHECK-INST: sttclr x0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x14,0x20,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  5920145f <unknown>
+
+sttclr x2, [sp]
+// CHECK-INST: sttclr x2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x17,0x22,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  592217ff <unknown>
+
+sttclrl w0, [x2]
+// CHECK-INST: sttclrl w0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x14,0x60,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  1960145f <unknown>
+
+sttclrl w2, [sp]
+// CHECK-INST: sttclrl w2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x17,0x62,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  196217ff <unknown>
+
+sttclrl x0, [x2]
+// CHECK-INST: sttclrl x0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x14,0x60,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  5960145f <unknown>
+
+sttclrl x2, [sp]
+// CHECK-INST: sttclrl x2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x17,0x62,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  596217ff <unknown>
+
+sttset w0, [x2]
+// CHECK-INST: sttset w0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x34,0x20,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  1920345f <unknown>
+
+sttset w2, [sp]
+// CHECK-INST: sttset w2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x37,0x22,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  192237ff <unknown>
+
+sttset x0, [x2]
+// CHECK-INST: sttset x0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x34,0x20,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  5920345f <unknown>
+
+sttset x2, [sp]
+// CHECK-INST: sttset x2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x37,0x22,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  592237ff <unknown>
+
+sttsetl w0, [x2]
+// CHECK-INST: sttsetl w0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x34,0x60,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  1960345f <unknown>
+
+sttsetl w2, [sp]
+// CHECK-INST: sttsetl w2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x37,0x62,0x19]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  196237ff <unknown>
+
+sttsetl x0, [x2]
+// CHECK-INST: sttsetl x0, [x2]
+// CHECK-ENCODING: encoding: [0x5f,0x34,0x60,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  5960345f <unknown>
+
+sttsetl x2, [sp]
+// CHECK-INST: sttsetl x2, [sp]
+// CHECK-ENCODING: encoding: [0xff,0x37,0x62,0x59]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  596237ff <unknown>
 
 //------------------------------------------------------------------------------
 // Load/store non-temporal register pair (offset)
 //------------------------------------------------------------------------------
-  ldtnp      x21, x29, [x2, #504]
-// CHECK: ldtnp	x21, x29, [x2, #504]            // encoding: [0x55,0xf4,0x5f,0xe8]
-// ERROR: instruction requires: lsui
-  ldtnp      x22, x23, [x3, #-512]
-// CHECK: ldtnp	x22, x23, [x3, #-512]           // encoding: [0x76,0x5c,0x60,0xe8]
-// ERROR: instruction requires: lsui
-  ldtnp      x24, x25, [x4, #8]
-// CHECK: ldtnp	x24, x25, [x4, #8]              // encoding: [0x98,0xe4,0x40,0xe8]
-// ERROR: instruction requires: lsui
-  ldtnp      q23, q29, [x1, #-1024]
-// CHECK: ldtnp	q23, q29, [x1, #-1024]          // encoding: [0x37,0x74,0x60,0xec]
-// ERROR: instruction requires: lsui
-
-  sttnp      x3, x5, [sp]
-// CHECK: sttnp	x3, x5, [sp]                    // encoding: [0xe3,0x17,0x00,0xe8]
-// ERROR: instruction requires: lsui
-  sttnp      x17, x19, [sp, #64]
-// CHECK: sttnp	x17, x19, [sp, #64]             // encoding: [0xf1,0x4f,0x04,0xe8]
-// ERROR: instruction requires: lsui
-  sttnp      q3, q5, [sp]
-// CHECK: sttnp	q3, q5, [sp]                    // encoding: [0xe3,0x17,0x00,0xec]
-// ERROR: instruction requires: lsui
-  sttnp      q17, q19, [sp, #1008]
-// CHECK: sttnp	q17, q19, [sp, #1008]           // encoding: [0xf1,0xcf,0x1f,0xec]
-// ERROR: instruction requires: lsui
-
+ldtnp x21, x29, [x2, #504]
+// CHECK-INST: ldtnp x21, x29, [x2, #504]
+// CHECK-ENCODING: encoding: [0x55,0xf4,0x5f,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e85ff455 <unknown>
+
+ldtnp x22, x23, [x3, #-512]
+// CHECK-INST: ldtnp x22, x23, [x3, #-512]
+// CHECK-ENCODING: encoding: [0x76,0x5c,0x60,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e8605c76 <unknown>
+
+ldtnp x24, x25, [x4, #8]
+// CHECK-INST: ldtnp x24, x25, [x4, #8]
+// CHECK-ENCODING: encoding: [0x98,0xe4,0x40,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e840e498 <unknown>
+
+ldtnp q23, q29, [x1, #-1024]
+// CHECK-INST: ldtnp q23, q29, [x1, #-1024]
+// CHECK-ENCODING: encoding: [0x37,0x74,0x60,0xec]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ec607437 <unknown>
+
+sttnp x3, x5, [sp]
+// CHECK-INST: sttnp x3, x5, [sp]
+// CHECK-ENCODING: encoding: [0xe3,0x17,0x00,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e80017e3 <unknown>
+
+sttnp x17, x19, [sp, #64]
+// CHECK-INST: sttnp x17, x19, [sp, #64]
+// CHECK-ENCODING: encoding: [0xf1,0x4f,0x04,0xe8]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  e8044ff1 <unknown>
+
+sttnp q3, q5, [sp]
+// CHECK-INST: sttnp q3, q5, [sp]
+// CHECK-ENCODING: encoding: [0xe3,0x17,0x00,0xec]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ec0017e3 <unknown>
+
+sttnp q17, q19, [sp, #1008]
+// CHECK-INST: sttnp q17, q19, [sp, #1008]
+// CHECK-ENCODING: encoding: [0xf1,0xcf,0x1f,0xec]
+// CHECK-ERROR: error: instruction requires: lsui
+// CHECK-UNKNOWN:  ec1fcff1 <unknown>
diff --git a/llvm/test/MC/AArch64/armv9.6a-mpam-diagnostics.s b/llvm/test/MC/AArch64/armv9.6a-mpam-diagnostics.s
new file mode 100644
index 0000000..a39eaef
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.6a-mpam-diagnostics.s
@@ -0,0 +1,5 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+
+msr MPAMBWIDR_EL1, x0
+// CHECK-ERROR: error: expected writable system register or pstate
+\ No newline at end of file
diff --git a/llvm/test/MC/AArch64/armv9.6a-mpam.s b/llvm/test/MC/AArch64/armv9.6a-mpam.s
index c0696ef..82603ca 100644
--- a/llvm/test/MC/AArch64/armv9.6a-mpam.s
+++ b/llvm/test/MC/AArch64/armv9.6a-mpam.s
@@ -1,45 +1,94 @@
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding < %s 2> %t | FileCheck %s --check-prefix=CHECK
-// RUN: FileCheck --check-prefix=CHECK-RO < %t %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 //------------------------------------------------------------------------------
 // Armv9.6-A FEAT_MPAM Extensions
 //------------------------------------------------------------------------------
 
-msr MPAMBWIDR_EL1, x0
 msr MPAMBW3_EL3, x0
+// CHECK-INST: msr MPAMBW3_EL3, x0
+// CHECK-ENCODING: encoding: [0x80,0xa5,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51ea580 msr MPAMBW3_EL3, x0
+
 msr MPAMBW2_EL2, x0
+// CHECK-INST: msr MPAMBW2_EL2, x0
+// CHECK-ENCODING: encoding: [0x80,0xa5,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51ca580 msr MPAMBW2_EL2, x0
+
 msr MPAMBW1_EL1, x0
+// CHECK-INST: msr MPAMBW1_EL1, x0
+// CHECK-ENCODING: encoding: [0x80,0xa5,0x18,0xd5]
+// CHECK-UNKNOWN:  d518a580 msr MPAMBW1_EL1, x0
+
 msr MPAMBW1_EL12, x0
+// CHECK-INST: msr MPAMBW1_EL12, x0
+// CHECK-ENCODING: encoding: [0x80,0xa5,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51da580 msr MPAMBW1_EL12, x0
+
 msr MPAMBW0_EL1, x0
+// CHECK-INST: msr MPAMBW0_EL1, x0
+// CHECK-ENCODING: encoding: [0xa0,0xa5,0x18,0xd5]
+// CHECK-UNKNOWN:  d518a5a0 msr MPAMBW0_EL1, x0
+
 msr MPAMBWCAP_EL2, x0
+// CHECK-INST: msr MPAMBWCAP_EL2, x0
+// CHECK-ENCODING: encoding: [0xc0,0xa5,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51ca5c0 msr MPAMBWCAP_EL2, x0
+
 msr MPAMBWSM_EL1, x0
+// CHECK-INST: msr MPAMBWSM_EL1, x0
+// CHECK-ENCODING: encoding: [0xe0,0xa5,0x18,0xd5]
+// CHECK-UNKNOWN:  d518a5e0 msr MPAMBWSM_EL1, x0
 
 mrs x0, MPAMBWIDR_EL1
+// CHECK-INST: mrs x0, MPAMBWIDR_EL1
+// CHECK-ENCODING: encoding: [0xa0,0xa4,0x38,0xd5]
+// CHECK-UNKNOWN:  d538a4a0 mrs x0, MPAMBWIDR_EL1
+
 mrs x0, MPAMBW3_EL3
+// CHECK-INST: mrs x0, MPAMBW3_EL3
+// CHECK-ENCODING: encoding: [0x80,0xa5,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53ea580 mrs x0, MPAMBW3_EL3
+
 mrs x0, MPAMBW2_EL2
+// CHECK-INST: mrs x0, MPAMBW2_EL2
+// CHECK-ENCODING: encoding: [0x80,0xa5,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53ca580 mrs x0, MPAMBW2_EL2
+
 mrs x0, MPAMBW1_EL1
+// CHECK-INST: mrs x0, MPAMBW1_EL1
+// CHECK-ENCODING: encoding: [0x80,0xa5,0x38,0xd5]
+// CHECK-UNKNOWN:  d538a580 mrs x0, MPAMBW1_EL1
+
 mrs x0, MPAMBW1_EL12
+// CHECK-INST: mrs x0, MPAMBW1_EL12
+// CHECK-ENCODING: encoding: [0x80,0xa5,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53da580 mrs x0, MPAMBW1_EL12
+
 mrs x0, MPAMBW0_EL1
+// CHECK-INST: mrs x0, MPAMBW0_EL1
+// CHECK-ENCODING: encoding: [0xa0,0xa5,0x38,0xd5]
+// CHECK-UNKNOWN:  d538a5a0 mrs x0, MPAMBW0_EL1
+
 mrs x0, MPAMBWCAP_EL2
+// CHECK-INST: mrs x0, MPAMBWCAP_EL2
+// CHECK-ENCODING: encoding: [0xc0,0xa5,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53ca5c0 mrs x0, MPAMBWCAP_EL2
+
 mrs x0, MPAMBWSM_EL1
+// CHECK-INST: mrs x0, MPAMBWSM_EL1
+// CHECK-ENCODING: encoding: [0xe0,0xa5,0x38,0xd5]
+// CHECK-UNKNOWN:  d538a5e0 mrs x0, MPAMBWSM_EL1
+
 
-//CHECK: msr     MPAMBW3_EL3, x0                 // encoding: [0x80,0xa5,0x1e,0xd5]
-//CHECK: msr     MPAMBW2_EL2, x0                 // encoding: [0x80,0xa5,0x1c,0xd5]
-//CHECK: msr     MPAMBW1_EL1, x0                 // encoding: [0x80,0xa5,0x18,0xd5]
-//CHECK: msr     MPAMBW1_EL12, x0                // encoding: [0x80,0xa5,0x1d,0xd5]
-//CHECK: msr     MPAMBW0_EL1, x0                 // encoding: [0xa0,0xa5,0x18,0xd5]
-//CHECK: msr     MPAMBWCAP_EL2, x0               // encoding: [0xc0,0xa5,0x1c,0xd5]
-//CHECK: msr     MPAMBWSM_EL1, x0                // encoding: [0xe0,0xa5,0x18,0xd5]
-
-//CHECK-RO: error: expected writable system register or pstate
-//CHECK-RO: msr MPAMBWIDR_EL1, x0
-//CHECK-RO:     ^
-
-//CHECK: mrs     x0, MPAMBWIDR_EL1               // encoding: [0xa0,0xa4,0x38,0xd5]
-//CHECK: mrs     x0, MPAMBW3_EL3                 // encoding: [0x80,0xa5,0x3e,0xd5]
-//CHECK: mrs     x0, MPAMBW2_EL2                 // encoding: [0x80,0xa5,0x3c,0xd5]
-//CHECK: mrs     x0, MPAMBW1_EL1                 // encoding: [0x80,0xa5,0x38,0xd5]
-//CHECK: mrs     x0, MPAMBW1_EL12                // encoding: [0x80,0xa5,0x3d,0xd5]
-//CHECK: mrs     x0, MPAMBW0_EL1                 // encoding: [0xa0,0xa5,0x38,0xd5]
-//CHECK: mrs     x0, MPAMBWCAP_EL2               // encoding: [0xc0,0xa5,0x3c,0xd5]
-//CHECK: mrs     x0, MPAMBWSM_EL1                // encoding: [0xe0,0xa5,0x38,0xd5]
diff --git a/llvm/test/MC/AArch64/armv9.6a-occmo.s b/llvm/test/MC/AArch64/armv9.6a-occmo.s
index d6548f9..9f25642 100644
--- a/llvm/test/MC/AArch64/armv9.6a-occmo.s
+++ b/llvm/test/MC/AArch64/armv9.6a-occmo.s
@@ -1,17 +1,39 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+occmo -mattr=+mte %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding %s -mattr=+mte 2>&1 | FileCheck --check-prefix=ERROR %s
-.func:
-// CHECK: .func:
-  dc civaoc, x12
-// CHECK: dc	civaoc, x12                     // encoding: [0x0c,0x7f,0x0b,0xd5]
-// ERROR: error: DC CIVAOC requires: occmo
-  dc cigdvaoc, x0
-// CHECK: dc	cigdvaoc, x0                    // encoding: [0xe0,0x7f,0x0b,0xd5]
-// ERROR: error: DC CIGDVAOC requires: mte, memtag, occmo
-  dc cvaoc, x13
-// CHECK: dc	cvaoc, x13                      // encoding: [0x0d,0x7b,0x0b,0xd5]
-// ERROR: error: DC CVAOC requires: occmo
-  dc cgdvaoc, x1
-// CHECK: dc	cgdvaoc, x1                     // encoding: [0xe1,0x7b,0x0b,0xd5]
-// ERROR: error: DC CGDVAOC requires: mte, memtag, occmo
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+occmo,+mte,+memtag < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+occmo,+mte,+memtag < %s \
+// RUN:        | llvm-objdump -d --mattr=+occmo,+mte,+memtag --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+occmo,+mte,+memtag < %s \
+// RUN:        | llvm-objdump -d --mattr=-occmo,-mte,-memtag --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+occmo,+mte,+memtag < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+occmo,+mte,+memtag -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
+
+
+dc civaoc, x12
+// CHECK-INST: dc civaoc, x12
+// CHECK-ENCODING: encoding: [0x0c,0x7f,0x0b,0xd5]
+// CHECK-ERROR: error: DC CIVAOC requires: occmo
+// CHECK-UNKNOWN:  d50b7f0c      sys #3, c7, c15, #0, x12
+
+dc cigdvaoc, x0
+// CHECK-INST: dc cigdvaoc, x0
+// CHECK-ENCODING: encoding: [0xe0,0x7f,0x0b,0xd5]
+// CHECK-ERROR: error: DC CIGDVAOC requires: mte, memtag, occmo
+// CHECK-UNKNOWN:  d50b7fe0      sys #3, c7, c15, #7, x0
+
+dc cvaoc, x13
+// CHECK-INST: dc cvaoc, x13
+// CHECK-ENCODING: encoding: [0x0d,0x7b,0x0b,0xd5]
+// CHECK-ERROR: error: DC CVAOC requires: occmo
+// CHECK-UNKNOWN:  d50b7b0d      sys #3, c7, c11, #0, x13
+
+dc cgdvaoc, x1
+// CHECK-INST: dc cgdvaoc, x1
+// CHECK-ENCODING: encoding: [0xe1,0x7b,0x0b,0xd5]
+// CHECK-ERROR: error: DC CGDVAOC requires: mte, memtag, occmo
+// CHECK-UNKNOWN:  d50b7be1      sys #3, c7, c11, #7, x1
diff --git a/llvm/test/MC/AArch64/armv9.6a-pcdphint.s b/llvm/test/MC/AArch64/armv9.6a-pcdphint.s
index 6314e53..8394171 100644
--- a/llvm/test/MC/AArch64/armv9.6a-pcdphint.s
+++ b/llvm/test/MC/AArch64/armv9.6a-pcdphint.s
@@ -1,13 +1,25 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+pcdphint %s | FileCheck %s
-// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding %s 2>&1 | FileCheck --check-prefix=ERROR %s
-
-.func:
-// CHECK: .func:
-  stshh keep
-// CHECK: stshh	keep                            // encoding: [0x1f,0x96,0x01,0xd5]
-// ERROR: error: instruction requires: pcdphint
-  stshh strm
-// CHECK: stshh	strm                            // encoding: [0x3f,0x96,0x01,0xd5]
-// ERROR: error: instruction requires: pcdphint
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+pcdphint < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pcdphint < %s \
+// RUN:        | llvm-objdump -d --mattr=+pcdphint - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pcdphint < %s \
+// RUN:        | llvm-objdump -d --mattr=-pcdphint - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+pcdphint < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+pcdphint -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
+stshh keep
+// CHECK-INST: stshh keep
+// CHECK-ENCODING: encoding: [0x1f,0x96,0x01,0xd5]
+// CHECK-ERROR: error: instruction requires: pcdphint
+// CHECK-UNKNOWN:  d501961f      msr S0_1_C9_C6_0, xzr
 
+stshh strm
+// CHECK-INST: stshh strm
+// CHECK-ENCODING: encoding: [0x3f,0x96,0x01,0xd5]
+// CHECK-ERROR: error: instruction requires: pcdphint
+// CHECK-UNKNOWN:  d501963f      msr S0_1_C9_C6_1, xzr
diff --git a/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s b/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
index 093101b..2a1943a 100644
--- a/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
+++ b/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
@@ -1,19 +1,45 @@
-# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-// RUN: llvm-mc -triple aarch64 -show-encoding %s  | FileCheck %s
-.func:
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
   apas x0
+// CHECK-INST: apas x0
+// CHECK-ENCODING: encoding: [0x00,0x70,0x0e,0xd5]
+// CHECK-UNKNOWN:  d50e7000      apas x0
+
   apas x1
+// CHECK-INST: apas x1
+// CHECK-ENCODING: encoding: [0x01,0x70,0x0e,0xd5]
+// CHECK-UNKNOWN:  d50e7001      apas x1
+
   apas x2
+// CHECK-INST: apas x2
+// CHECK-ENCODING: encoding: [0x02,0x70,0x0e,0xd5]
+// CHECK-UNKNOWN:  d50e7002      apas x2
+
   apas x17
+// CHECK-INST: apas x17
+// CHECK-ENCODING: encoding: [0x11,0x70,0x0e,0xd5]
+// CHECK-UNKNOWN:  d50e7011      apas x17
+
   apas x30
+// CHECK-INST: apas x30
+// CHECK-ENCODING: encoding: [0x1e,0x70,0x0e,0xd5]
+// CHECK-UNKNOWN:  d50e701e      apas x30
+
   mrs x3, GPCBW_EL3
-  msr GPCBW_EL3, x4
+// CHECK-INST: mrs x3, GPCBW_EL3
+// CHECK-ENCODING: encoding: [0xa3,0x21,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53e21a3      mrs x3, GPCBW_EL3
 
-# CHECK:      .func:
-# CHECK-NEXT:	apas    x0                              // encoding: [0x00,0x70,0x0e,0xd5]
-# CHECK-NEXT:	apas    x1                              // encoding: [0x01,0x70,0x0e,0xd5]
-# CHECK-NEXT:	apas    x2                              // encoding: [0x02,0x70,0x0e,0xd5]
-# CHECK-NEXT:	apas    x17                             // encoding: [0x11,0x70,0x0e,0xd5]
-# CHECK-NEXT:	apas    x30                             // encoding: [0x1e,0x70,0x0e,0xd5]
-# CHECK-NEXT: 	mrs	x3, GPCBW_EL3                   // encoding: [0xa3,0x21,0x3e,0xd5]
-# CHECK-NEXT: 	msr	GPCBW_EL3, x4                   // encoding: [0xa4,0x21,0x1e,0xd5]
+  msr GPCBW_EL3, x4
+// CHECK-INST: msr GPCBW_EL3, x4
+// CHECK-ENCODING: encoding: [0xa4,0x21,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51e21a4      msr GPCBW_EL3, x4
diff --git a/llvm/test/MC/AArch64/armv9.6a-srmask.s b/llvm/test/MC/AArch64/armv9.6a-srmask.s
index 40f0e98..fb91993 100644
--- a/llvm/test/MC/AArch64/armv9.6a-srmask.s
+++ b/llvm/test/MC/AArch64/armv9.6a-srmask.s
@@ -1,102 +1,254 @@
-// RUN: llvm-mc -triple aarch64 -show-encoding %s   | FileCheck %s
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
 
 mrs x3, SCTLRMASK_EL1
-// CHECK: mrs	x3, SCTLRMASK_EL1               // encoding: [0x03,0x14,0x38,0xd5]
+// CHECK-INST: mrs x3, SCTLRMASK_EL1
+// CHECK-ENCODING: encoding: [0x03,0x14,0x38,0xd5]
+// CHECK-UNKNOWN:  d5381403 mrs x3, SCTLRMASK_EL1
+
 mrs x3, SCTLRMASK_EL2
-// CHECK: mrs	x3, SCTLRMASK_EL2               // encoding: [0x03,0x14,0x3c,0xd5]
+// CHECK-INST: mrs x3, SCTLRMASK_EL2
+// CHECK-ENCODING: encoding: [0x03,0x14,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c1403 mrs x3, SCTLRMASK_EL2
+
 mrs x3, SCTLRMASK_EL12
-// CHECK: mrs	x3, SCTLRMASK_EL12              // encoding: [0x03,0x14,0x3d,0xd5]
+// CHECK-INST: mrs x3, SCTLRMASK_EL12
+// CHECK-ENCODING: encoding: [0x03,0x14,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d1403 mrs x3, SCTLRMASK_EL12
+
 mrs x3, CPACRMASK_EL1
-// CHECK: mrs	x3, CPACRMASK_EL1               // encoding: [0x43,0x14,0x38,0xd5]
+// CHECK-INST: mrs x3, CPACRMASK_EL1
+// CHECK-ENCODING: encoding: [0x43,0x14,0x38,0xd5]
+// CHECK-UNKNOWN:  d5381443 mrs x3, CPACRMASK_EL1
+
 mrs x3, CPTRMASK_EL2
-// CHECK: mrs	x3, CPTRMASK_EL2                // encoding: [0x43,0x14,0x3c,0xd5]
+// CHECK-INST: mrs x3, CPTRMASK_EL2
+// CHECK-ENCODING: encoding: [0x43,0x14,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c1443 mrs x3, CPTRMASK_EL2
+
 mrs x3, CPACRMASK_EL12
-// CHECK: mrs	x3, CPACRMASK_EL12              // encoding: [0x43,0x14,0x3d,0xd5]
+// CHECK-INST: mrs x3, CPACRMASK_EL12
+// CHECK-ENCODING: encoding: [0x43,0x14,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d1443 mrs x3, CPACRMASK_EL12
+
 mrs x3, SCTLR2MASK_EL1
-// CHECK: mrs	x3, SCTLR2MASK_EL1              // encoding: [0x63,0x14,0x38,0xd5]
+// CHECK-INST: mrs x3, SCTLR2MASK_EL1
+// CHECK-ENCODING: encoding: [0x63,0x14,0x38,0xd5]
+// CHECK-UNKNOWN:  d5381463 mrs x3, SCTLR2MASK_EL1
+
 mrs x3, SCTLR2MASK_EL2
-// CHECK: mrs	x3, SCTLR2MASK_EL2              // encoding: [0x63,0x14,0x3c,0xd5]
+// CHECK-INST: mrs x3, SCTLR2MASK_EL2
+// CHECK-ENCODING: encoding: [0x63,0x14,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c1463 mrs x3, SCTLR2MASK_EL2
+
 mrs x3, SCTLR2MASK_EL12
-// CHECK: mrs	x3, SCTLR2MASK_EL12             // encoding: [0x63,0x14,0x3d,0xd5]
+// CHECK-INST: mrs x3, SCTLR2MASK_EL12
+// CHECK-ENCODING: encoding: [0x63,0x14,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d1463 mrs x3, SCTLR2MASK_EL12
+
 mrs x3, CPACRALIAS_EL1
-// CHECK: mrs	x3, CPACRALIAS_EL1              // encoding: [0x83,0x14,0x38,0xd5]
+// CHECK-INST: mrs x3, CPACRALIAS_EL1
+// CHECK-ENCODING: encoding: [0x83,0x14,0x38,0xd5]
+// CHECK-UNKNOWN:  d5381483 mrs x3, CPACRALIAS_EL1
+
 mrs x3, SCTLRALIAS_EL1
-// CHECK: mrs	x3, SCTLRALIAS_EL1              // encoding: [0xc3,0x14,0x38,0xd5]
+// CHECK-INST: mrs x3, SCTLRALIAS_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x14,0x38,0xd5]
+// CHECK-UNKNOWN:  d53814c3 mrs x3, SCTLRALIAS_EL1
+
 mrs x3, SCTLR2ALIAS_EL1
-// CHECK: mrs	x3, SCTLR2ALIAS_EL1             // encoding: [0xe3,0x14,0x38,0xd5]
+// CHECK-INST: mrs x3, SCTLR2ALIAS_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x14,0x38,0xd5]
+// CHECK-UNKNOWN:  d53814e3 mrs x3, SCTLR2ALIAS_EL1
+
 mrs x3, TCRMASK_EL1
-// CHECK: mrs	x3, TCRMASK_EL1                 // encoding: [0x43,0x27,0x38,0xd5]
+// CHECK-INST: mrs x3, TCRMASK_EL1
+// CHECK-ENCODING: encoding: [0x43,0x27,0x38,0xd5]
+// CHECK-UNKNOWN:  d5382743 mrs x3, TCRMASK_EL1
+
 mrs x3, TCRMASK_EL2
-// CHECK: mrs	x3, TCRMASK_EL2                 // encoding: [0x43,0x27,0x3c,0xd5]
+// CHECK-INST: mrs x3, TCRMASK_EL2
+// CHECK-ENCODING: encoding: [0x43,0x27,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c2743 mrs x3, TCRMASK_EL2
+
 mrs x3, TCRMASK_EL12
-// CHECK: mrs	x3, TCRMASK_EL12                // encoding: [0x43,0x27,0x3d,0xd5]
+// CHECK-INST: mrs x3, TCRMASK_EL12
+// CHECK-ENCODING: encoding: [0x43,0x27,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d2743 mrs x3, TCRMASK_EL12
+
 mrs x3, TCR2MASK_EL1
-// CHECK: mrs	x3, TCR2MASK_EL1                // encoding: [0x63,0x27,0x38,0xd5]
+// CHECK-INST: mrs x3, TCR2MASK_EL1
+// CHECK-ENCODING: encoding: [0x63,0x27,0x38,0xd5]
+// CHECK-UNKNOWN:  d5382763 mrs x3, TCR2MASK_EL1
+
 mrs x3, TCR2MASK_EL2
-// CHECK: mrs	x3, TCR2MASK_EL2                // encoding: [0x63,0x27,0x3c,0xd5]
+// CHECK-INST: mrs x3, TCR2MASK_EL2
+// CHECK-ENCODING: encoding: [0x63,0x27,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c2763 mrs x3, TCR2MASK_EL2
+
 mrs x3, TCR2MASK_EL12
-// CHECK: mrs	x3, TCR2MASK_EL12               // encoding: [0x63,0x27,0x3d,0xd5]
+// CHECK-INST: mrs x3, TCR2MASK_EL12
+// CHECK-ENCODING: encoding: [0x63,0x27,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d2763 mrs x3, TCR2MASK_EL12
+
 mrs x3, TCRALIAS_EL1
-// CHECK: mrs	x3, TCRALIAS_EL1                // encoding: [0xc3,0x27,0x38,0xd5]
+// CHECK-INST: mrs x3, TCRALIAS_EL1
+// CHECK-ENCODING: encoding: [0xc3,0x27,0x38,0xd5]
+// CHECK-UNKNOWN:  d53827c3 mrs x3, TCRALIAS_EL1
+
 mrs x3, TCR2ALIAS_EL1
-// CHECK: mrs	x3, TCR2ALIAS_EL1               // encoding: [0xe3,0x27,0x38,0xd5]
+// CHECK-INST: mrs x3, TCR2ALIAS_EL1
+// CHECK-ENCODING: encoding: [0xe3,0x27,0x38,0xd5]
+// CHECK-UNKNOWN:  d53827e3 mrs x3, TCR2ALIAS_EL1
+
 mrs x3, ACTLRMASK_EL1
-// CHECK: mrs	x3, ACTLRMASK_EL1               // encoding: [0x23,0x14,0x38,0xd5]
+// CHECK-INST: mrs x3, ACTLRMASK_EL1
+// CHECK-ENCODING: encoding: [0x23,0x14,0x38,0xd5]
+// CHECK-UNKNOWN:  d5381423 mrs x3, ACTLRMASK_EL1
+
 mrs x3, ACTLRMASK_EL2
-// CHECK: mrs	x3, ACTLRMASK_EL2               // encoding: [0x23,0x14,0x3c,0xd5]
+// CHECK-INST: mrs x3, ACTLRMASK_EL2
+// CHECK-ENCODING: encoding: [0x23,0x14,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c1423 mrs x3, ACTLRMASK_EL2
+
 mrs x3, ACTLRMASK_EL12
-// CHECK: mrs	x3, ACTLRMASK_EL12              // encoding: [0x23,0x14,0x3d,0xd5]
+// CHECK-INST: mrs x3, ACTLRMASK_EL12
+// CHECK-ENCODING: encoding: [0x23,0x14,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d1423 mrs x3, ACTLRMASK_EL12
+
 mrs x3, ACTLRALIAS_EL1
-// CHECK: mrs	x3, ACTLRALIAS_EL1              // encoding: [0xa3,0x14,0x38,0xd5]
+// CHECK-INST: mrs x3, ACTLRALIAS_EL1
+// CHECK-ENCODING: encoding: [0xa3,0x14,0x38,0xd5]
+// CHECK-UNKNOWN:  d53814a3 mrs x3, ACTLRALIAS_EL1
 
 msr SCTLRMASK_EL1, x3
-// CHECK: msr	SCTLRMASK_EL1, x3               // encoding: [0x03,0x14,0x18,0xd5]
+// CHECK-INST: msr SCTLRMASK_EL1, x3
+// CHECK-ENCODING: encoding: [0x03,0x14,0x18,0xd5]
+// CHECK-UNKNOWN:  d5181403 msr SCTLRMASK_EL1, x3
+
 msr SCTLRMASK_EL2, x3
-// CHECK: msr	SCTLRMASK_EL2, x3               // encoding: [0x03,0x14,0x1c,0xd5]
+// CHECK-INST: msr SCTLRMASK_EL2, x3
+// CHECK-ENCODING: encoding: [0x03,0x14,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c1403 msr SCTLRMASK_EL2, x3
+
 msr SCTLRMASK_EL12, x3
-// CHECK: msr	SCTLRMASK_EL12, x3              // encoding: [0x03,0x14,0x1d,0xd5]
+// CHECK-INST: msr SCTLRMASK_EL12, x3
+// CHECK-ENCODING: encoding: [0x03,0x14,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d1403 msr SCTLRMASK_EL12, x3
+
 msr CPACRMASK_EL1, x3
-// CHECK: msr	CPACRMASK_EL1, x3               // encoding: [0x43,0x14,0x18,0xd5]
+// CHECK-INST: msr CPACRMASK_EL1, x3
+// CHECK-ENCODING: encoding: [0x43,0x14,0x18,0xd5]
+// CHECK-UNKNOWN:  d5181443 msr CPACRMASK_EL1, x3
+
 msr CPTRMASK_EL2, x3
-// CHECK: msr	CPTRMASK_EL2, x3                // encoding: [0x43,0x14,0x1c,0xd5]
+// CHECK-INST: msr CPTRMASK_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x14,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c1443 msr CPTRMASK_EL2, x3
+
 msr CPACRMASK_EL12, x3
-// CHECK: msr	CPACRMASK_EL12, x3              // encoding: [0x43,0x14,0x1d,0xd5]
+// CHECK-INST: msr CPACRMASK_EL12, x3
+// CHECK-ENCODING: encoding: [0x43,0x14,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d1443 msr CPACRMASK_EL12, x3
+
 msr SCTLR2MASK_EL1, x3
-// CHECK: msr	SCTLR2MASK_EL1, x3              // encoding: [0x63,0x14,0x18,0xd5]
+// CHECK-INST: msr SCTLR2MASK_EL1, x3
+// CHECK-ENCODING: encoding: [0x63,0x14,0x18,0xd5]
+// CHECK-UNKNOWN:  d5181463 msr SCTLR2MASK_EL1, x3
+
 msr SCTLR2MASK_EL2, x3
-// CHECK: msr	SCTLR2MASK_EL2, x3              // encoding: [0x63,0x14,0x1c,0xd5]
+// CHECK-INST: msr SCTLR2MASK_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x14,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c1463 msr SCTLR2MASK_EL2, x3
+
 msr SCTLR2MASK_EL12, x3
-// CHECK: msr	SCTLR2MASK_EL12, x3             // encoding: [0x63,0x14,0x1d,0xd5]
+// CHECK-INST: msr SCTLR2MASK_EL12, x3
+// CHECK-ENCODING: encoding: [0x63,0x14,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d1463 msr SCTLR2MASK_EL12, x3
+
 msr CPACRALIAS_EL1, x3
-// CHECK: msr	CPACRALIAS_EL1, x3              // encoding: [0x83,0x14,0x18,0xd5]
+// CHECK-INST: msr CPACRALIAS_EL1, x3
+// CHECK-ENCODING: encoding: [0x83,0x14,0x18,0xd5]
+// CHECK-UNKNOWN:  d5181483 msr CPACRALIAS_EL1, x3
+
 msr SCTLRALIAS_EL1, x3
-// CHECK: msr	SCTLRALIAS_EL1, x3              // encoding: [0xc3,0x14,0x18,0xd5]
+// CHECK-INST: msr SCTLRALIAS_EL1, x3
+// CHECK-ENCODING: encoding: [0xc3,0x14,0x18,0xd5]
+// CHECK-UNKNOWN:  d51814c3 msr SCTLRALIAS_EL1, x3
+
 msr SCTLR2ALIAS_EL1, x3
-// CHECK: msr	SCTLR2ALIAS_EL1, x3             // encoding: [0xe3,0x14,0x18,0xd5]
+// CHECK-INST: msr SCTLR2ALIAS_EL1, x3
+// CHECK-ENCODING: encoding: [0xe3,0x14,0x18,0xd5]
+// CHECK-UNKNOWN:  d51814e3 msr SCTLR2ALIAS_EL1, x3
+
 msr TCRMASK_EL1, x3
-// CHECK: msr	TCRMASK_EL1, x3                 // encoding: [0x43,0x27,0x18,0xd5]
+// CHECK-INST: msr TCRMASK_EL1, x3
+// CHECK-ENCODING: encoding: [0x43,0x27,0x18,0xd5]
+// CHECK-UNKNOWN:  d5182743 msr TCRMASK_EL1, x3
+
 msr TCRMASK_EL2, x3
-// CHECK: msr	TCRMASK_EL2, x3                 // encoding: [0x43,0x27,0x1c,0xd5]
+// CHECK-INST: msr TCRMASK_EL2, x3
+// CHECK-ENCODING: encoding: [0x43,0x27,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2743 msr TCRMASK_EL2, x3
+
 msr TCRMASK_EL12, x3
-// CHECK: msr	TCRMASK_EL12, x3                // encoding: [0x43,0x27,0x1d,0xd5]
+// CHECK-INST: msr TCRMASK_EL12, x3
+// CHECK-ENCODING: encoding: [0x43,0x27,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d2743 msr TCRMASK_EL12, x3
+
 msr TCR2MASK_EL1, x3
-// CHECK: msr	TCR2MASK_EL1, x3                // encoding: [0x63,0x27,0x18,0xd5]
+// CHECK-INST: msr TCR2MASK_EL1, x3
+// CHECK-ENCODING: encoding: [0x63,0x27,0x18,0xd5]
+// CHECK-UNKNOWN:  d5182763 msr TCR2MASK_EL1, x3
+
 msr TCR2MASK_EL2, x3
-// CHECK: msr	TCR2MASK_EL2, x3                // encoding: [0x63,0x27,0x1c,0xd5]
+// CHECK-INST: msr TCR2MASK_EL2, x3
+// CHECK-ENCODING: encoding: [0x63,0x27,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c2763 msr TCR2MASK_EL2, x3
+
 msr TCR2MASK_EL12, x3
-// CHECK: msr	TCR2MASK_EL12, x3               // encoding: [0x63,0x27,0x1d,0xd5]
+// CHECK-INST: msr TCR2MASK_EL12, x3
+// CHECK-ENCODING: encoding: [0x63,0x27,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d2763 msr TCR2MASK_EL12, x3
+
 msr TCRALIAS_EL1, x3
-// CHECK: msr	TCRALIAS_EL1, x3                // encoding: [0xc3,0x27,0x18,0xd5]
+// CHECK-INST: msr TCRALIAS_EL1, x3
+// CHECK-ENCODING: encoding: [0xc3,0x27,0x18,0xd5]
+// CHECK-UNKNOWN:  d51827c3 msr TCRALIAS_EL1, x3
+
 msr TCR2ALIAS_EL1, x3
-// CHECK: msr	TCR2ALIAS_EL1, x3               // encoding: [0xe3,0x27,0x18,0xd5]
+// CHECK-INST: msr TCR2ALIAS_EL1, x3
+// CHECK-ENCODING: encoding: [0xe3,0x27,0x18,0xd5]
+// CHECK-UNKNOWN:  d51827e3 msr TCR2ALIAS_EL1, x3
+
 msr ACTLRMASK_EL1, x3
-// CHECK: msr	ACTLRMASK_EL1, x3               // encoding: [0x23,0x14,0x18,0xd5]
+// CHECK-INST: msr ACTLRMASK_EL1, x3
+// CHECK-ENCODING: encoding: [0x23,0x14,0x18,0xd5]
+// CHECK-UNKNOWN:  d5181423 msr ACTLRMASK_EL1, x3
+
 msr ACTLRMASK_EL2, x3
-// CHECK: msr	ACTLRMASK_EL2, x3               // encoding: [0x23,0x14,0x1c,0xd5]
+// CHECK-INST: msr ACTLRMASK_EL2, x3
+// CHECK-ENCODING: encoding: [0x23,0x14,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c1423 msr ACTLRMASK_EL2, x3
+
 msr ACTLRMASK_EL12, x3
-// CHECK: msr	ACTLRMASK_EL12, x3              // encoding: [0x23,0x14,0x1d,0xd5]
+// CHECK-INST: msr ACTLRMASK_EL12, x3
+// CHECK-ENCODING: encoding: [0x23,0x14,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d1423 msr ACTLRMASK_EL12, x3
+
 msr ACTLRALIAS_EL1, x3
-// CHECK: msr	ACTLRALIAS_EL1, x3              // encoding: [0xa3,0x14,0x18,0xd5]
+// CHECK-INST: msr ACTLRALIAS_EL1, x3
+// CHECK-ENCODING: encoding: [0xa3,0x14,0x18,0xd5]
+// CHECK-UNKNOWN:  d51814a3 msr ACTLRALIAS_EL1, x3
 
 
 
diff --git a/llvm/test/MC/AArch64/armv9.6a-statistical-profiling.s b/llvm/test/MC/AArch64/armv9.6a-statistical-profiling.s
index 2314c41..4ef6367 100644
--- a/llvm/test/MC/AArch64/armv9.6a-statistical-profiling.s
+++ b/llvm/test/MC/AArch64/armv9.6a-statistical-profiling.s
@@ -1,19 +1,51 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding     < %s 2> %t | FileCheck %s
-
-  msr pmbmar_el1, x0
-  msr pmbsr_el12, x0
-  msr pmbsr_el2, x0
-  msr pmbsr_el3, x0
-// CHECK:     msr PMBMAR_EL1, x0          // encoding: [0xa0,0x9a,0x18,0xd5]
-// CHECK:     msr PMBSR_EL12, x0          // encoding: [0x60,0x9a,0x1d,0xd5]
-// CHECK:     msr PMBSR_EL2, x0           // encoding: [0x60,0x9a,0x1c,0xd5]
-// CHECK:     msr PMBSR_EL3, x0           // encoding: [0x60,0x9a,0x1e,0xd5]
-
-  mrs x0, pmbmar_el1
-  mrs x0, pmbsr_el12
-  mrs x0, pmbsr_el2
-  mrs x0, pmbsr_el3
-// CHECK:    mrs x0, PMBMAR_EL1          // encoding: [0xa0,0x9a,0x38,0xd5]
-// CHECK:    mrs x0, PMBSR_EL12          // encoding: [0x60,0x9a,0x3d,0xd5]
-// CHECK:    mrs x0, PMBSR_EL2           // encoding: [0x60,0x9a,0x3c,0xd5]
-// CHECK:    mrs x0, PMBSR_EL3           // encoding: [0x60,0x9a,0x3e,0xd5]
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
+
+msr pmbmar_el1, x0
+// CHECK-INST: msr PMBMAR_EL1, x0
+// CHECK-ENCODING: encoding: [0xa0,0x9a,0x18,0xd5]
+// CHECK-UNKNOWN:  d5189aa0 msr PMBMAR_EL1, x0
+
+msr pmbsr_el12, x0
+// CHECK-INST: msr PMBSR_EL12, x0
+// CHECK-ENCODING: encoding: [0x60,0x9a,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d9a60 msr PMBSR_EL12, x0
+
+msr pmbsr_el2, x0
+// CHECK-INST: msr PMBSR_EL2, x0
+// CHECK-ENCODING: encoding: [0x60,0x9a,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c9a60 msr PMBSR_EL2, x0
+
+msr pmbsr_el3, x0
+// CHECK-INST: msr PMBSR_EL3, x0
+// CHECK-ENCODING: encoding: [0x60,0x9a,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51e9a60 msr PMBSR_EL3, x0
+
+mrs x0, pmbmar_el1
+// CHECK-INST: mrs x0, PMBMAR_EL1
+// CHECK-ENCODING: encoding: [0xa0,0x9a,0x38,0xd5]
+// CHECK-UNKNOWN:  d5389aa0 mrs x0, PMBMAR_EL1
+
+mrs x0, pmbsr_el12
+// CHECK-INST: mrs x0, PMBSR_EL12
+// CHECK-ENCODING: encoding: [0x60,0x9a,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d9a60 mrs x0, PMBSR_EL12
+
+mrs x0, pmbsr_el2
+// CHECK-INST: mrs x0, PMBSR_EL2
+// CHECK-ENCODING: encoding: [0x60,0x9a,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c9a60 mrs x0, PMBSR_EL2
+
+mrs x0, pmbsr_el3
+// CHECK-INST: mrs x0, PMBSR_EL3
+// CHECK-ENCODING: encoding: [0x60,0x9a,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53e9a60 mrs x0, PMBSR_EL3
diff --git a/llvm/test/MC/AArch64/armv9.6a-trbe-exception.s b/llvm/test/MC/AArch64/armv9.6a-trbe-exception.s
index a8ba7c4..fb795aa 100644
--- a/llvm/test/MC/AArch64/armv9.6a-trbe-exception.s
+++ b/llvm/test/MC/AArch64/armv9.6a-trbe-exception.s
@@ -1,15 +1,41 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding      < %s 2> %t | FileCheck %s
-
-  msr trbsr_el12, x0
-  msr trbsr_el2, x0
-  msr trbsr_el3, x0
-// CHECK:     msr TRBSR_EL12, x0          // encoding: [0x60,0x9b,0x1d,0xd5]
-// CHECK:     msr TRBSR_EL2, x0           // encoding: [0x60,0x9b,0x1c,0xd5]
-// CHECK:     msr TRBSR_EL3, x0           // encoding: [0x60,0x9b,0x1e,0xd5]
-
-  mrs x0, trbsr_el12
-  mrs x0, trbsr_el2
-  mrs x0, trbsr_el3
-// CHECK:    mrs x0, TRBSR_EL12          // encoding: [0x60,0x9b,0x3d,0xd5]
-// CHECK:    mrs x0, TRBSR_EL2           // encoding: [0x60,0x9b,0x3c,0xd5]
-// CHECK:    mrs x0, TRBSR_EL3           // encoding: [0x60,0x9b,0x3e,0xd5]
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN:        | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
+
+msr trbsr_el12, x0
+// CHECK-INST: msr TRBSR_EL12, x0
+// CHECK-ENCODING: encoding: [0x60,0x9b,0x1d,0xd5]
+// CHECK-UNKNOWN:  d51d9b60 msr TRBSR_EL12, x0
+
+msr trbsr_el2, x0
+// CHECK-INST: msr TRBSR_EL2, x0
+// CHECK-ENCODING: encoding: [0x60,0x9b,0x1c,0xd5]
+// CHECK-UNKNOWN:  d51c9b60 msr TRBSR_EL2, x0
+
+msr trbsr_el3, x0
+// CHECK-INST: msr TRBSR_EL3, x0
+// CHECK-ENCODING: encoding: [0x60,0x9b,0x1e,0xd5]
+// CHECK-UNKNOWN:  d51e9b60 msr TRBSR_EL3, x0
+
+mrs x0, trbsr_el12
+// CHECK-INST: mrs x0, TRBSR_EL12
+// CHECK-ENCODING: encoding: [0x60,0x9b,0x3d,0xd5]
+// CHECK-UNKNOWN:  d53d9b60 mrs x0, TRBSR_EL12
+
+mrs x0, trbsr_el2
+// CHECK-INST: mrs x0, TRBSR_EL2
+// CHECK-ENCODING: encoding: [0x60,0x9b,0x3c,0xd5]
+// CHECK-UNKNOWN:  d53c9b60 mrs x0, TRBSR_EL2
+
+mrs x0, trbsr_el3
+// CHECK-INST: mrs x0, TRBSR_EL3
+// CHECK-ENCODING: encoding: [0x60,0x9b,0x3e,0xd5]
+// CHECK-UNKNOWN:  d53e9b60 mrs x0, TRBSR_EL3
diff --git a/llvm/test/MC/AArch64/armv9a-sysp.s b/llvm/test/MC/AArch64/armv9a-sysp.s
new file mode 100644
index 0000000..6006575
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9a-sysp.s
@@ -0,0 +1,978 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+d128,+tlb-rmi,+xs < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+tlb-rmi,+xs < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+d128,+tlb-rmi,+xs < %s \
+// RUN:        | llvm-objdump -d --mattr=+d128,+tlb-rmi,+xs --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+d128,+tlb-rmi,+xs < %s \
+// RUN:   | llvm-objdump -d --mattr=-d128,+tlb-rmi,+xs --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+d128,+tlb-rmi,+xs < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+d128,+tlb-rmi,+xs -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
+// +tlb-rmi required for RIPA*/RVA*
+// +xs required for *NXS
+
+// sysp #<op1>, <Cn>, <Cm>, #<op2>{, <Xt1>, <Xt2>}
+// registers with 128-bit formats (op0, op1, Cn, Cm, op2)
+// For sysp, op0 is 0
+
+sysp #0, c2, c0, #0, x0, x1// TTBR0_EL1     3  0  2  0  0
+// CHECK-INST: sysp #0, c2, c0, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482000      <unknown>
+
+sysp #0, c2, c0, #1, x0, x1// TTBR1_EL1     3  0  2  0  1
+// CHECK-INST: sysp #0, c2, c0, #1, x0, x1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482020      <unknown>
+
+sysp #0, c7, c4, #0, x0, x1// PAR_EL1       3  0  7  4  0
+// CHECK-INST: sysp #0, c7, c4, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x74,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5487400      <unknown>
+
+sysp #0, c13, c0, #3, x0, x1         // RCWSMASK_EL1  3  0 13  0  3
+// CHECK-INST: sysp #0, c13, c0, #3, x0, x1
+// CHECK-ENCODING: encoding: [0x60,0xd0,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548d060      <unknown>
+
+sysp #0, c13, c0, #6, x0, x1         // RCWMASK_EL1   3  0 13  0  6
+// CHECK-INST: sysp #0, c13, c0, #6, x0, x1
+// CHECK-ENCODING: encoding: [0xc0,0xd0,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548d0c0      <unknown>
+
+sysp #4, c2, c0, #0, x0, x1// TTBR0_EL2     3  4  2  0  0
+// CHECK-INST: sysp #4, c2, c0, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c2000      <unknown>
+
+sysp #4, c2, c0, #1, x0, x1// TTBR1_EL2     3  4  2  0  1
+// CHECK-INST: sysp #4, c2, c0, #1, x0, x1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c2020      <unknown>
+
+sysp #4, c2, c1, #0, x0, x1// VTTBR_EL2     3  4  2  1  0
+// CHECK-INST: sysp #4, c2, c1, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x21,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c2100      <unknown>
+
+
+
+sysp #0, c2, c0, #0, x0, x1
+// CHECK-INST: sysp #0, c2, c0, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482000      <unknown>
+
+sysp #0, c2, c0, #1, x0, x1
+// CHECK-INST: sysp #0, c2, c0, #1, x0, x1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482020      <unknown>
+
+sysp #0, c7, c4, #0, x0, x1
+// CHECK-INST: sysp #0, c7, c4, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x74,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5487400      <unknown>
+
+sysp #0, c13, c0, #3, x0, x1
+// CHECK-INST: sysp #0, c13, c0, #3, x0, x1
+// CHECK-ENCODING: encoding: [0x60,0xd0,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548d060      <unknown>
+
+sysp #0, c13, c0, #6, x0, x1
+// CHECK-INST: sysp #0, c13, c0, #6, x0, x1
+// CHECK-ENCODING: encoding: [0xc0,0xd0,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548d0c0      <unknown>
+
+sysp #4, c2, c0, #0, x0, x1
+// CHECK-INST: sysp #4, c2, c0, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c2000      <unknown>
+
+sysp #4, c2, c0, #1, x0, x1
+// CHECK-INST: sysp #4, c2, c0, #1, x0, x1
+// CHECK-ENCODING: encoding: [0x20,0x20,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c2020      <unknown>
+
+sysp #4, c2, c1, #0, x0, x1
+// CHECK-INST: sysp #4, c2, c1, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x21,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c2100      <unknown>
+
+sysp #0, c2, c0, #0, x0, x1
+// CHECK-INST: sysp #0, c2, c0, #0, x0, x1
+// CHECK-ENCODING: encoding: [0x00,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482000      <unknown>
+
+sysp #0, c2, c0, #0, x2, x3
+// CHECK-INST: sysp #0, c2, c0, #0, x2, x3
+// CHECK-ENCODING: encoding: [0x02,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482002      <unknown>
+
+sysp #0, c2, c0, #0, x4, x5
+// CHECK-INST: sysp #0, c2, c0, #0, x4, x5
+// CHECK-ENCODING: encoding: [0x04,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482004      <unknown>
+
+sysp #0, c2, c0, #0, x6, x7
+// CHECK-INST: sysp #0, c2, c0, #0, x6, x7
+// CHECK-ENCODING: encoding: [0x06,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482006      <unknown>
+
+sysp #0, c2, c0, #0, x8, x9
+// CHECK-INST: sysp #0, c2, c0, #0, x8, x9
+// CHECK-ENCODING: encoding: [0x08,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482008      <unknown>
+
+sysp #0, c2, c0, #0, x10, x11
+// CHECK-INST: sysp #0, c2, c0, #0, x10, x11
+// CHECK-ENCODING: encoding: [0x0a,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548200a      <unknown>
+
+sysp #0, c2, c0, #0, x12, x13
+// CHECK-INST: sysp #0, c2, c0, #0, x12, x13
+// CHECK-ENCODING: encoding: [0x0c,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548200c      <unknown>
+
+sysp #0, c2, c0, #0, x14, x15
+// CHECK-INST: sysp #0, c2, c0, #0, x14, x15
+// CHECK-ENCODING: encoding: [0x0e,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548200e      <unknown>
+
+sysp #0, c2, c0, #0, x16, x17
+// CHECK-INST: sysp #0, c2, c0, #0, x16, x17
+// CHECK-ENCODING: encoding: [0x10,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482010      <unknown>
+
+sysp #0, c2, c0, #0, x18, x19
+// CHECK-INST: sysp #0, c2, c0, #0, x18, x19
+// CHECK-ENCODING: encoding: [0x12,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482012      <unknown>
+
+sysp #0, c2, c0, #0, x20, x21
+// CHECK-INST: sysp #0, c2, c0, #0, x20, x21
+// CHECK-ENCODING: encoding: [0x14,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482014      <unknown>
+
+sysp #0, c2, c0, #0, x22, x23
+// CHECK-INST: sysp #0, c2, c0, #0, x22, x23
+// CHECK-ENCODING: encoding: [0x16,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482016      <unknown>
+
+sysp #0, c2, c0, #0, x24, x25
+// CHECK-INST: sysp #0, c2, c0, #0, x24, x25
+// CHECK-ENCODING: encoding: [0x18,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5482018      <unknown>
+
+sysp #0, c2, c0, #0, x26, x27
+// CHECK-INST: sysp #0, c2, c0, #0, x26, x27
+// CHECK-ENCODING: encoding: [0x1a,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548201a      <unknown>
+
+sysp #0, c2, c0, #0, x28, x29
+// CHECK-INST: sysp #0, c2, c0, #0, x28, x29
+// CHECK-ENCODING: encoding: [0x1c,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548201c      <unknown>
+
+sysp #0, c2, c0, #0, x30, x31
+// CHECK-INST: sysp #0, c2, c0, #0, x30, xzr
+// CHECK-ENCODING: encoding: [0x1e,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548201e      <unknown>
+
+
+sysp #0, c2, c0, #0, x31, x31
+// CHECK-INST: sysp #0, c2, c0, #0
+// CHECK-ENCODING: encoding: [0x1f,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548201f      <unknown>
+
+sysp #0, c2, c0, #0, xzr, xzr
+// CHECK-INST: sysp #0, c2, c0, #0
+// CHECK-ENCODING: encoding: [0x1f,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548201f      <unknown>
+
+sysp #0, c2, c0, #0, x31, xzr
+// CHECK-INST: sysp #0, c2, c0, #0
+// CHECK-ENCODING: encoding: [0x1f,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548201f      <unknown>
+
+sysp #0, c2, c0, #0, xzr, x31
+// CHECK-INST: sysp #0, c2, c0, #0
+// CHECK-ENCODING: encoding: [0x1f,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548201f      <unknown>
+
+sysp #0, c2, c0, #0
+// CHECK-INST: sysp #0, c2, c0, #0
+// CHECK-ENCODING: encoding: [0x1f,0x20,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d548201f      <unknown>
+
+tlbip IPAS2E1, x4, x5
+// CHECK-INST: tlbip ipas2e1, x4, x5
+// CHECK-ENCODING: encoding: [0x24,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c8424      <unknown>
+
+tlbip IPAS2E1NXS, x4, x5
+// CHECK-INST: tlbip ipas2e1nxs, x4, x5
+// CHECK-ENCODING: encoding: [0x24,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c9424      <unknown>
+
+tlbip IPAS2E1IS, x4, x5
+// CHECK-INST: tlbip ipas2e1is, x4, x5
+// CHECK-ENCODING: encoding: [0x24,0x80,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c8024      <unknown>
+
+tlbip IPAS2E1ISNXS, x4, x5
+// CHECK-INST: tlbip ipas2e1isnxs, x4, x5
+// CHECK-ENCODING: encoding: [0x24,0x90,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c9024      <unknown>
+
+tlbip IPAS2E1OS, x4, x5
+// CHECK-INST: tlbip ipas2e1os, x4, x5
+// CHECK-ENCODING: encoding: [0x04,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c8404      <unknown>
+
+tlbip IPAS2E1OSNXS, x4, x5
+// CHECK-INST: tlbip ipas2e1osnxs, x4, x5
+// CHECK-ENCODING: encoding: [0x04,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c9404      <unknown>
+
+tlbip IPAS2LE1, x4, x5
+// CHECK-INST: tlbip ipas2le1, x4, x5
+// CHECK-ENCODING: encoding: [0xa4,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c84a4      <unknown>
+
+tlbip IPAS2LE1NXS, x4, x5
+// CHECK-INST: tlbip ipas2le1nxs, x4, x5
+// CHECK-ENCODING: encoding: [0xa4,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c94a4      <unknown>
+
+tlbip IPAS2LE1IS, x4, x5
+// CHECK-INST: tlbip ipas2le1is, x4, x5
+// CHECK-ENCODING: encoding: [0xa4,0x80,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c80a4      <unknown>
+
+tlbip IPAS2LE1ISNXS, x4, x5
+// CHECK-INST: tlbip ipas2le1isnxs, x4, x5
+// CHECK-ENCODING: encoding: [0xa4,0x90,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c90a4      <unknown>
+
+tlbip IPAS2LE1OS, x4, x5
+// CHECK-INST: tlbip ipas2le1os, x4, x5
+// CHECK-ENCODING: encoding: [0x84,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c8484      <unknown>
+
+tlbip IPAS2LE1OSNXS, x4, x5
+// CHECK-INST: tlbip ipas2le1osnxs, x4, x5
+// CHECK-ENCODING: encoding: [0x84,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c9484      <unknown>
+
+tlbip VAE1, x8, x9
+// CHECK-INST: tlbip vae1, x8, x9
+// CHECK-ENCODING: encoding: [0x28,0x87,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488728      <unknown>
+
+tlbip VAE1NXS, x8, x9
+// CHECK-INST: tlbip vae1nxs, x8, x9
+// CHECK-ENCODING: encoding: [0x28,0x97,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489728      <unknown>
+
+tlbip VAE1IS, x8, x9
+// CHECK-INST: tlbip vae1is, x8, x9
+// CHECK-ENCODING: encoding: [0x28,0x83,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488328      <unknown>
+
+tlbip VAE1ISNXS, x8, x9
+// CHECK-INST: tlbip vae1isnxs, x8, x9
+// CHECK-ENCODING: encoding: [0x28,0x93,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489328      <unknown>
+
+tlbip VAE1OS, x8, x9
+// CHECK-INST: tlbip vae1os, x8, x9
+// CHECK-ENCODING: encoding: [0x28,0x81,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488128      <unknown>
+
+tlbip VAE1OSNXS, x8, x9
+// CHECK-INST: tlbip vae1osnxs, x8, x9
+// CHECK-ENCODING: encoding: [0x28,0x91,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489128      <unknown>
+
+tlbip VALE1, x8, x9
+// CHECK-INST: tlbip vale1, x8, x9
+// CHECK-ENCODING: encoding: [0xa8,0x87,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54887a8      <unknown>
+
+tlbip VALE1NXS, x8, x9
+// CHECK-INST: tlbip vale1nxs, x8, x9
+// CHECK-ENCODING: encoding: [0xa8,0x97,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54897a8      <unknown>
+
+tlbip VALE1IS, x8, x9
+// CHECK-INST: tlbip vale1is, x8, x9
+// CHECK-ENCODING: encoding: [0xa8,0x83,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54883a8      <unknown>
+
+tlbip VALE1ISNXS, x8, x9
+// CHECK-INST: tlbip vale1isnxs, x8, x9
+// CHECK-ENCODING: encoding: [0xa8,0x93,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54893a8      <unknown>
+
+tlbip VALE1OS, x8, x9
+// CHECK-INST: tlbip vale1os, x8, x9
+// CHECK-ENCODING: encoding: [0xa8,0x81,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54881a8      <unknown>
+
+tlbip VALE1OSNXS, x8, x9
+// CHECK-INST: tlbip vale1osnxs, x8, x9
+// CHECK-ENCODING: encoding: [0xa8,0x91,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54891a8      <unknown>
+
+tlbip VAAE1, x8, x9
+// CHECK-INST: tlbip vaae1, x8, x9
+// CHECK-ENCODING: encoding: [0x68,0x87,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488768      <unknown>
+
+tlbip VAAE1NXS, x8, x9
+// CHECK-INST: tlbip vaae1nxs, x8, x9
+// CHECK-ENCODING: encoding: [0x68,0x97,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489768      <unknown>
+
+tlbip VAAE1IS, x8, x9
+// CHECK-INST: tlbip vaae1is, x8, x9
+// CHECK-ENCODING: encoding: [0x68,0x83,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488368      <unknown>
+
+tlbip VAAE1ISNXS, x8, x9
+// CHECK-INST: tlbip vaae1isnxs, x8, x9
+// CHECK-ENCODING: encoding: [0x68,0x93,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489368      <unknown>
+
+tlbip VAAE1OS, x8, x9
+// CHECK-INST: tlbip vaae1os, x8, x9
+// CHECK-ENCODING: encoding: [0x68,0x81,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488168      <unknown>
+
+tlbip VAAE1OSNXS, x8, x9
+// CHECK-INST: tlbip vaae1osnxs, x8, x9
+// CHECK-ENCODING: encoding: [0x68,0x91,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489168      <unknown>
+
+tlbip VAALE1, x8, x9
+// CHECK-INST: tlbip vaale1, x8, x9
+// CHECK-ENCODING: encoding: [0xe8,0x87,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54887e8      <unknown>
+
+tlbip VAALE1NXS, x8, x9
+// CHECK-INST: tlbip vaale1nxs, x8, x9
+// CHECK-ENCODING: encoding: [0xe8,0x97,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54897e8      <unknown>
+
+tlbip VAALE1IS, x8, x9
+// CHECK-INST: tlbip vaale1is, x8, x9
+// CHECK-ENCODING: encoding: [0xe8,0x83,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54883e8      <unknown>
+
+tlbip VAALE1ISNXS, x8, x9
+// CHECK-INST: tlbip vaale1isnxs, x8, x9
+// CHECK-ENCODING: encoding: [0xe8,0x93,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54893e8      <unknown>
+
+tlbip VAALE1OS, x8, x9
+// CHECK-INST: tlbip vaale1os, x8, x9
+// CHECK-ENCODING: encoding: [0xe8,0x81,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54881e8      <unknown>
+
+tlbip VAALE1OSNXS, x8, x9
+// CHECK-INST: tlbip vaale1osnxs, x8, x9
+// CHECK-ENCODING: encoding: [0xe8,0x91,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54891e8      <unknown>
+
+tlbip VAE2, x14, x15
+// CHECK-INST: tlbip vae2, x14, x15
+// CHECK-ENCODING: encoding: [0x2e,0x87,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c872e      <unknown>
+
+tlbip VAE2NXS, x14, x15
+// CHECK-INST: tlbip vae2nxs, x14, x15
+// CHECK-ENCODING: encoding: [0x2e,0x97,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c972e      <unknown>
+
+tlbip VAE2IS, x14, x15
+// CHECK-INST: tlbip vae2is, x14, x15
+// CHECK-ENCODING: encoding: [0x2e,0x83,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c832e      <unknown>
+
+tlbip VAE2ISNXS, x14, x15
+// CHECK-INST: tlbip vae2isnxs, x14, x15
+// CHECK-ENCODING: encoding: [0x2e,0x93,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c932e      <unknown>
+
+tlbip VAE2OS, x14, x15
+// CHECK-INST: tlbip vae2os, x14, x15
+// CHECK-ENCODING: encoding: [0x2e,0x81,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c812e      <unknown>
+
+tlbip VAE2OSNXS, x14, x15
+// CHECK-INST: tlbip vae2osnxs, x14, x15
+// CHECK-ENCODING: encoding: [0x2e,0x91,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c912e      <unknown>
+
+tlbip VALE2, x14, x15
+// CHECK-INST: tlbip vale2, x14, x15
+// CHECK-ENCODING: encoding: [0xae,0x87,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c87ae      <unknown>
+
+tlbip VALE2NXS, x14, x15
+// CHECK-INST: tlbip vale2nxs, x14, x15
+// CHECK-ENCODING: encoding: [0xae,0x97,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c97ae      <unknown>
+
+tlbip VALE2IS, x14, x15
+// CHECK-INST: tlbip vale2is, x14, x15
+// CHECK-ENCODING: encoding: [0xae,0x83,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c83ae      <unknown>
+
+tlbip VALE2ISNXS, x14, x15
+// CHECK-INST: tlbip vale2isnxs, x14, x15
+// CHECK-ENCODING: encoding: [0xae,0x93,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c93ae      <unknown>
+
+tlbip VALE2OS, x14, x15
+// CHECK-INST: tlbip vale2os, x14, x15
+// CHECK-ENCODING: encoding: [0xae,0x81,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c81ae      <unknown>
+
+tlbip VALE2OSNXS, x14, x15
+// CHECK-INST: tlbip vale2osnxs, x14, x15
+// CHECK-ENCODING: encoding: [0xae,0x91,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c91ae      <unknown>
+
+tlbip VAE3, x24, x25
+// CHECK-INST: tlbip vae3, x24, x25
+// CHECK-ENCODING: encoding: [0x38,0x87,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e8738      <unknown>
+
+tlbip VAE3NXS, x24, x25
+// CHECK-INST: tlbip vae3nxs, x24, x25
+// CHECK-ENCODING: encoding: [0x38,0x97,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e9738      <unknown>
+
+tlbip VAE3IS, x24, x25
+// CHECK-INST: tlbip vae3is, x24, x25
+// CHECK-ENCODING: encoding: [0x38,0x83,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e8338      <unknown>
+
+tlbip VAE3ISNXS, x24, x25
+// CHECK-INST: tlbip vae3isnxs, x24, x25
+// CHECK-ENCODING: encoding: [0x38,0x93,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e9338      <unknown>
+
+tlbip VAE3OS, x24, x25
+// CHECK-INST: tlbip vae3os, x24, x25
+// CHECK-ENCODING: encoding: [0x38,0x81,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e8138      <unknown>
+
+tlbip VAE3OSNXS, x24, x25
+// CHECK-INST: tlbip vae3osnxs, x24, x25
+// CHECK-ENCODING: encoding: [0x38,0x91,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e9138      <unknown>
+
+tlbip VALE3, x24, x25
+// CHECK-INST: tlbip vale3, x24, x25
+// CHECK-ENCODING: encoding: [0xb8,0x87,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e87b8      <unknown>
+
+tlbip VALE3NXS, x24, x25
+// CHECK-INST: tlbip vale3nxs, x24, x25
+// CHECK-ENCODING: encoding: [0xb8,0x97,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e97b8      <unknown>
+
+tlbip VALE3IS, x24, x25
+// CHECK-INST: tlbip vale3is, x24, x25
+// CHECK-ENCODING: encoding: [0xb8,0x83,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e83b8      <unknown>
+
+tlbip VALE3ISNXS, x24, x25
+// CHECK-INST: tlbip vale3isnxs, x24, x25
+// CHECK-ENCODING: encoding: [0xb8,0x93,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e93b8      <unknown>
+
+tlbip VALE3OS, x24, x25
+// CHECK-INST: tlbip vale3os, x24, x25
+// CHECK-ENCODING: encoding: [0xb8,0x81,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e81b8      <unknown>
+
+tlbip VALE3OSNXS, x24, x25
+// CHECK-INST: tlbip vale3osnxs, x24, x25
+// CHECK-ENCODING: encoding: [0xb8,0x91,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e91b8      <unknown>
+
+tlbip RVAE1, x18, x19
+// CHECK-INST: tlbip rvae1, x18, x19
+// CHECK-ENCODING: encoding: [0x32,0x86,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488632      <unknown>
+
+tlbip RVAE1NXS, x18, x19
+// CHECK-INST: tlbip rvae1nxs, x18, x19
+// CHECK-ENCODING: encoding: [0x32,0x96,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489632      <unknown>
+
+tlbip RVAE1IS, x18, x19
+// CHECK-INST: tlbip rvae1is, x18, x19
+// CHECK-ENCODING: encoding: [0x32,0x82,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488232      <unknown>
+
+tlbip RVAE1ISNXS, x18, x19
+// CHECK-INST: tlbip rvae1isnxs, x18, x19
+// CHECK-ENCODING: encoding: [0x32,0x92,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489232      <unknown>
+
+tlbip RVAE1OS, x18, x19
+// CHECK-INST: tlbip rvae1os, x18, x19
+// CHECK-ENCODING: encoding: [0x32,0x85,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488532      <unknown>
+
+tlbip RVAE1OSNXS, x18, x19
+// CHECK-INST: tlbip rvae1osnxs, x18, x19
+// CHECK-ENCODING: encoding: [0x32,0x95,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489532      <unknown>
+
+tlbip RVAAE1, x18, x19
+// CHECK-INST: tlbip rvaae1, x18, x19
+// CHECK-ENCODING: encoding: [0x72,0x86,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488672      <unknown>
+
+tlbip RVAAE1NXS, x18, x19
+// CHECK-INST: tlbip rvaae1nxs, x18, x19
+// CHECK-ENCODING: encoding: [0x72,0x96,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489672      <unknown>
+
+tlbip RVAAE1IS, x18, x19
+// CHECK-INST: tlbip rvaae1is, x18, x19
+// CHECK-ENCODING: encoding: [0x72,0x82,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488272      <unknown>
+
+tlbip RVAAE1ISNXS, x18, x19
+// CHECK-INST: tlbip rvaae1isnxs, x18, x19
+// CHECK-ENCODING: encoding: [0x72,0x92,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489272      <unknown>
+
+tlbip RVAAE1OS, x18, x19
+// CHECK-INST: tlbip rvaae1os, x18, x19
+// CHECK-ENCODING: encoding: [0x72,0x85,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5488572      <unknown>
+
+tlbip RVAAE1OSNXS, x18, x19
+// CHECK-INST: tlbip rvaae1osnxs, x18, x19
+// CHECK-ENCODING: encoding: [0x72,0x95,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d5489572      <unknown>
+
+tlbip RVALE1, x18, x19
+// CHECK-INST: tlbip rvale1, x18, x19
+// CHECK-ENCODING: encoding: [0xb2,0x86,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54886b2      <unknown>
+
+tlbip RVALE1NXS, x18, x19
+// CHECK-INST: tlbip rvale1nxs, x18, x19
+// CHECK-ENCODING: encoding: [0xb2,0x96,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54896b2      <unknown>
+
+tlbip RVALE1IS, x18, x19
+// CHECK-INST: tlbip rvale1is, x18, x19
+// CHECK-ENCODING: encoding: [0xb2,0x82,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54882b2      <unknown>
+
+tlbip RVALE1ISNXS, x18, x19
+// CHECK-INST: tlbip rvale1isnxs, x18, x19
+// CHECK-ENCODING: encoding: [0xb2,0x92,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54892b2      <unknown>
+
+tlbip RVALE1OS, x18, x19
+// CHECK-INST: tlbip rvale1os, x18, x19
+// CHECK-ENCODING: encoding: [0xb2,0x85,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54885b2      <unknown>
+
+tlbip RVALE1OSNXS, x18, x19
+// CHECK-INST: tlbip rvale1osnxs, x18, x19
+// CHECK-ENCODING: encoding: [0xb2,0x95,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54895b2      <unknown>
+
+tlbip RVAALE1, x18, x19
+// CHECK-INST: tlbip rvaale1, x18, x19
+// CHECK-ENCODING: encoding: [0xf2,0x86,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54886f2      <unknown>
+
+tlbip RVAALE1NXS, x18, x19
+// CHECK-INST: tlbip rvaale1nxs, x18, x19
+// CHECK-ENCODING: encoding: [0xf2,0x96,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54896f2      <unknown>
+
+tlbip RVAALE1IS, x18, x19
+// CHECK-INST: tlbip rvaale1is, x18, x19
+// CHECK-ENCODING: encoding: [0xf2,0x82,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54882f2      <unknown>
+
+tlbip RVAALE1ISNXS, x18, x19
+// CHECK-INST: tlbip rvaale1isnxs, x18, x19
+// CHECK-ENCODING: encoding: [0xf2,0x92,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54892f2      <unknown>
+
+tlbip RVAALE1OS, x18, x19
+// CHECK-INST: tlbip rvaale1os, x18, x19
+// CHECK-ENCODING: encoding: [0xf2,0x85,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54885f2      <unknown>
+
+tlbip RVAALE1OSNXS, x18, x19
+// CHECK-INST: tlbip rvaale1osnxs, x18, x19
+// CHECK-ENCODING: encoding: [0xf2,0x95,0x48,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54895f2      <unknown>
+
+tlbip RVAE2, x28, x29
+// CHECK-INST: tlbip rvae2, x28, x29
+// CHECK-ENCODING: encoding: [0x3c,0x86,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c863c      <unknown>
+
+tlbip RVAE2NXS, x28, x29
+// CHECK-INST: tlbip rvae2nxs, x28, x29
+// CHECK-ENCODING: encoding: [0x3c,0x96,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c963c      <unknown>
+
+tlbip RVAE2IS, x28, x29
+// CHECK-INST: tlbip rvae2is, x28, x29
+// CHECK-ENCODING: encoding: [0x3c,0x82,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c823c      <unknown>
+
+tlbip RVAE2ISNXS, x28, x29
+// CHECK-INST: tlbip rvae2isnxs, x28, x29
+// CHECK-ENCODING: encoding: [0x3c,0x92,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c923c      <unknown>
+
+tlbip RVAE2OS, x28, x29
+// CHECK-INST: tlbip rvae2os, x28, x29
+// CHECK-ENCODING: encoding: [0x3c,0x85,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c853c      <unknown>
+
+tlbip RVAE2OSNXS, x28, x29
+// CHECK-INST: tlbip rvae2osnxs, x28, x29
+// CHECK-ENCODING: encoding: [0x3c,0x95,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c953c      <unknown>
+
+tlbip RVALE2, x28, x29
+// CHECK-INST: tlbip rvale2, x28, x29
+// CHECK-ENCODING: encoding: [0xbc,0x86,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c86bc      <unknown>
+
+tlbip RVALE2NXS, x28, x29
+// CHECK-INST: tlbip rvale2nxs, x28, x29
+// CHECK-ENCODING: encoding: [0xbc,0x96,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c96bc      <unknown>
+
+tlbip RVALE2IS, x28, x29
+// CHECK-INST: tlbip rvale2is, x28, x29
+// CHECK-ENCODING: encoding: [0xbc,0x82,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c82bc      <unknown>
+
+tlbip RVALE2ISNXS, x28, x29
+// CHECK-INST: tlbip rvale2isnxs, x28, x29
+// CHECK-ENCODING: encoding: [0xbc,0x92,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c92bc      <unknown>
+
+tlbip RVALE2OS, x28, x29
+// CHECK-INST: tlbip rvale2os, x28, x29
+// CHECK-ENCODING: encoding: [0xbc,0x85,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c85bc      <unknown>
+
+tlbip RVALE2OSNXS, x28, x29
+// CHECK-INST: tlbip rvale2osnxs, x28, x29
+// CHECK-ENCODING: encoding: [0xbc,0x95,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c95bc      <unknown>
+
+tlbip RVAE3, x10, x11
+// CHECK-INST: tlbip rvae3, x10, x11
+// CHECK-ENCODING: encoding: [0x2a,0x86,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e862a      <unknown>
+
+tlbip RVAE3NXS, x10, x11
+// CHECK-INST: tlbip rvae3nxs, x10, x11
+// CHECK-ENCODING: encoding: [0x2a,0x96,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e962a      <unknown>
+
+tlbip RVAE3IS, x10, x11
+// CHECK-INST: tlbip rvae3is, x10, x11
+// CHECK-ENCODING: encoding: [0x2a,0x82,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e822a      <unknown>
+
+tlbip RVAE3ISNXS, x10, x11
+// CHECK-INST: tlbip rvae3isnxs, x10, x11
+// CHECK-ENCODING: encoding: [0x2a,0x92,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e922a      <unknown>
+
+tlbip RVAE3OS, x10, x11
+// CHECK-INST: tlbip rvae3os, x10, x11
+// CHECK-ENCODING: encoding: [0x2a,0x85,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e852a      <unknown>
+
+tlbip RVAE3OSNXS, x10, x11
+// CHECK-INST: tlbip rvae3osnxs, x10, x11
+// CHECK-ENCODING: encoding: [0x2a,0x95,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e952a      <unknown>
+
+tlbip RVALE3, x10, x11
+// CHECK-INST: tlbip rvale3, x10, x11
+// CHECK-ENCODING: encoding: [0xaa,0x86,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e86aa      <unknown>
+
+tlbip RVALE3NXS, x10, x11
+// CHECK-INST: tlbip rvale3nxs, x10, x11
+// CHECK-ENCODING: encoding: [0xaa,0x96,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e96aa      <unknown>
+
+tlbip RVALE3IS, x10, x11
+// CHECK-INST: tlbip rvale3is, x10, x11
+// CHECK-ENCODING: encoding: [0xaa,0x82,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e82aa      <unknown>
+
+tlbip RVALE3ISNXS, x10, x11
+// CHECK-INST: tlbip rvale3isnxs, x10, x11
+// CHECK-ENCODING: encoding: [0xaa,0x92,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e92aa      <unknown>
+
+tlbip RVALE3OS, x10, x11
+// CHECK-INST: tlbip rvale3os, x10, x11
+// CHECK-ENCODING: encoding: [0xaa,0x85,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e85aa      <unknown>
+
+tlbip RVALE3OSNXS, x10, x11
+// CHECK-INST: tlbip rvale3osnxs, x10, x11
+// CHECK-ENCODING: encoding: [0xaa,0x95,0x4e,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54e95aa      <unknown>
+
+tlbip RIPAS2E1, x20, x21
+// CHECK-INST: tlbip ripas2e1, x20, x21
+// CHECK-ENCODING: encoding: [0x54,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c8454      <unknown>
+
+tlbip RIPAS2E1NXS, x20, x21
+// CHECK-INST: tlbip ripas2e1nxs, x20, x21
+// CHECK-ENCODING: encoding: [0x54,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c9454      <unknown>
+
+tlbip RIPAS2E1IS, x20, x21
+// CHECK-INST: tlbip ripas2e1is, x20, x21
+// CHECK-ENCODING: encoding: [0x54,0x80,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c8054      <unknown>
+
+tlbip RIPAS2E1ISNXS, x20, x21
+// CHECK-INST: tlbip ripas2e1isnxs, x20, x21
+// CHECK-ENCODING: encoding: [0x54,0x90,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c9054      <unknown>
+
+tlbip RIPAS2E1OS, x20, x21
+// CHECK-INST: tlbip ripas2e1os, x20, x21
+// CHECK-ENCODING: encoding: [0x74,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c8474      <unknown>
+
+tlbip RIPAS2E1OSNXS, x20, x21
+// CHECK-INST: tlbip ripas2e1osnxs, x20, x21
+// CHECK-ENCODING: encoding: [0x74,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c9474      <unknown>
+
+tlbip RIPAS2LE1, x20, x21
+// CHECK-INST: tlbip ripas2le1, x20, x21
+// CHECK-ENCODING: encoding: [0xd4,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c84d4      <unknown>
+
+tlbip RIPAS2LE1NXS, x20, x21
+// CHECK-INST: tlbip ripas2le1nxs, x20, x21
+// CHECK-ENCODING: encoding: [0xd4,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c94d4      <unknown>
+
+tlbip RIPAS2LE1IS, x20, x21
+// CHECK-INST: tlbip ripas2le1is, x20, x21
+// CHECK-ENCODING: encoding: [0xd4,0x80,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c80d4      <unknown>
+
+tlbip RIPAS2LE1ISNXS, x20, x21
+// CHECK-INST: tlbip ripas2le1isnxs, x20, x21
+// CHECK-ENCODING: encoding: [0xd4,0x90,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c90d4      <unknown>
+
+tlbip RIPAS2LE1OS, x20, x21
+// CHECK-INST: tlbip ripas2le1os, x20, x21
+// CHECK-ENCODING: encoding: [0xf4,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c84f4      <unknown>
+
+tlbip RIPAS2LE1OSNXS, x20, x21
+// CHECK-INST: tlbip ripas2le1osnxs, x20, x21
+// CHECK-ENCODING: encoding: [0xf4,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c94f4      <unknown>
+
+tlbip RIPAS2LE1OS, xzr, xzr
+// CHECK-INST: tlbip ripas2le1os, xzr, xzr
+// CHECK-ENCODING: encoding: [0xff,0x84,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c84ff      <unknown>
+
+tlbip RIPAS2LE1OSNXS, xzr, xzr
+// CHECK-INST: tlbip ripas2le1osnxs, xzr, xzr
+// CHECK-ENCODING: encoding: [0xff,0x94,0x4c,0xd5]
+// CHECK-ERROR: error: instruction requires: d128
+// CHECK-UNKNOWN:  d54c94ff      <unknown>
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_operands.s b/llvm/test/MC/AMDGPU/gfx1250_asm_operands.s
index 100fc98..91b3fcb 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_operands.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_operands.s
@@ -52,3 +52,7 @@ s_setreg_b32 hwreg(34), s1
 s_setreg_b32 hwreg(HW_REG_XNACK_MASK), s1
 // GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // GFX1250: encoding: [0x22,0xf8,0x01,0xb9]
+
+s_setreg_b32 hwreg(HW_REG_IB_STS2), s1
+// GFX1200-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX1250: encoding: [0x1c,0xf8,0x01,0xb9]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
index 234c2ed..bfc3544 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s
@@ -48,3 +48,10 @@ s_monitor_sleep 32768
 s_monitor_sleep 0
 // GFX1250: s_monitor_sleep 0                       ; encoding: [0x00,0x00,0x84,0xbf]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_sendmsg sendmsg(MSG_SAVEWAVE_HAS_TDM)
+// GFX1250: s_sendmsg sendmsg(MSG_SAVEWAVE_HAS_TDM)     ; encoding: [0x0a,0x00,0xb6,0xbf]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified message id is not supported on this GPU
+
+s_barrier_wait -3
+// GFX1250: s_barrier_wait -3                       ; encoding: [0xfd,0xff,0x94,0xbf]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sopp_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp_err.s
new file mode 100644
index 0000000..d151c9a
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sopp_err.s
@@ -0,0 +1,4 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX1250-ERR --implicit-check-not=error: -strict-whitespace %s
+
+s_setkill 0
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.6a-fgt.txt b/llvm/test/MC/Disassembler/AArch64/armv8.6a-fgt.txt
deleted file mode 100644
index 5b8d817..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv8.6a-fgt.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-# RUN: llvm-mc -triple=aarch64  -mattr=+fgt -disassemble < %s      | FileCheck %s
-# RUN: llvm-mc -triple=aarch64              -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOFGT
-
-[0x80,0x11,0x1c,0xd5]
-[0xa0,0x11,0x1c,0xd5]
-[0xc0,0x11,0x1c,0xd5]
-[0x80,0x31,0x1c,0xd5]
-[0xa0,0x31,0x1c,0xd5]
-[0xc0,0x31,0x1c,0xd5]
-
-# CHECK: msr HFGRTR_EL2, x0
-# CHECK: msr HFGWTR_EL2, x0
-# CHECK: msr HFGITR_EL2, x0
-# CHECK: msr HDFGRTR_EL2, x0
-# CHECK: msr HDFGWTR_EL2, x0
-# CHECK: msr HAFGRTR_EL2, x0
-# NOFGT: msr S3_4_C1_C1_4, x0
-# NOFGT: msr S3_4_C1_C1_5, x0
-# NOFGT: msr S3_4_C1_C1_6, x0
-# NOFGT: msr S3_4_C3_C1_4, x0
-# NOFGT: msr S3_4_C3_C1_5, x0
-# NOFGT: msr S3_4_C3_C1_6, x0
-
-[0x80,0x11,0x3c,0xd5]
-[0xa0,0x11,0x3c,0xd5]
-[0xc0,0x11,0x3c,0xd5]
-[0x80,0x31,0x3c,0xd5]
-[0xa0,0x31,0x3c,0xd5]
-[0xc0,0x31,0x3c,0xd5]
-
-# CHECK: mrs x0, HFGRTR_EL2
-# CHECK: mrs x0, HFGWTR_EL2
-# CHECK: mrs x0, HFGITR_EL2
-# CHECK: mrs x0, HDFGRTR_EL2
-# CHECK: mrs x0, HDFGWTR_EL2
-# CHECK: mrs x0, HAFGRTR_EL2
-# NOFGT: mrs x0, S3_4_C1_C1_4
-# NOFGT: mrs x0, S3_4_C1_C1_5
-# NOFGT: mrs x0, S3_4_C1_C1_6
-# NOFGT: mrs x0, S3_4_C3_C1_4
-# NOFGT: mrs x0, S3_4_C3_C1_5
-# NOFGT: mrs x0, S3_4_C3_C1_6
-
-[0x03,0x31,0x3c,0xd5]
-[0x23,0x31,0x3c,0xd5]
-[0x43,0x31,0x3c,0xd5]
-[0x63,0x31,0x3c,0xd5]
-[0xe3,0x31,0x3c,0xd5]
-# CHECK: mrs x3, HDFGRTR2_EL2
-# CHECK: mrs x3, HDFGWTR2_EL2
-# CHECK: mrs x3, HFGRTR2_EL2
-# CHECK: mrs x3, HFGWTR2_EL2
-# CHECK: mrs x3, HFGITR2_EL2
-# NOFGT: mrs x3, S3_4_C3_C1_0
-# NOFGT: mrs x3, S3_4_C3_C1_1
-# NOFGT: mrs x3, S3_4_C3_C1_2
-# NOFGT: mrs x3, S3_4_C3_C1_3
-# NOFGT: mrs x3, S3_4_C3_C1_7
-
-
-[0x03,0x31,0x1c,0xd5]
-[0x23,0x31,0x1c,0xd5]
-[0x43,0x31,0x1c,0xd5]
-[0x63,0x31,0x1c,0xd5]
-[0xe3,0x31,0x1c,0xd5]
-# CHECK: msr HDFGRTR2_EL2, x3
-# CHECK: msr HDFGWTR2_EL2, x3
-# CHECK: msr HFGRTR2_EL2, x3
-# CHECK: msr HFGWTR2_EL2, x3
-# CHECK: msr HFGITR2_EL2, x3
-# NOFGT: msr S3_4_C3_C1_0, x3
-# NOFGT: msr S3_4_C3_C1_1, x3
-# NOFGT: msr S3_4_C3_C1_2, x3
-# NOFGT: msr S3_4_C3_C1_3, x3
-# NOFGT: msr S3_4_C3_C1_7, x3
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.8a-mops.txt b/llvm/test/MC/Disassembler/AArch64/armv8.8a-mops.txt
deleted file mode 100644
index de7121c..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv8.8a-mops.txt
+++ /dev/null
@@ -1,434 +0,0 @@
-# RUN: not llvm-mc -triple aarch64 -mattr=+mops,+mte -disassemble < %s 2> %t | FileCheck %s --check-prefixes=CHECK-MOPS,CHECK-MTE
-# RUN: FileCheck %s --check-prefix=CHECK-INVALID < %t
-# RUN: not llvm-mc -triple aarch64 -mattr=+v8.8a,+mte -disassemble < %s 2> %t | FileCheck %s --check-prefixes=CHECK-MOPS,CHECK-MTE
-# RUN: FileCheck %s --check-prefix=CHECK-INVALID < %t
-# RUN: not llvm-mc -triple aarch64 -mattr=+mops -disassemble < %s 2> %t | FileCheck %s --check-prefix=CHECK-MOPS
-# RUN: FileCheck %s --check-prefixes=CHECK-INVALID,CHECK-NO-MTE < %t
-# RUN: not llvm-mc -triple aarch64 -mattr=+v8.8a -disassemble < %s 2> %t | FileCheck %s --check-prefix=CHECK-MOPS
-# RUN: FileCheck %s --check-prefixes=CHECK-INVALID,CHECK-NO-MTE < %t
-# RUN: not llvm-mc -triple aarch64 -disassemble < %s 2> %t
-# RUN: FileCheck %s --check-prefixes=CHECK-INVALID,CHECK-NO-MOPS,CHECK-NO-MTE < %t
-
-
-[0x40,0x04,0x01,0x19]
-[0x40,0x44,0x01,0x19]
-[0x40,0x84,0x01,0x19]
-[0x40,0xc4,0x01,0x19]
-[0x40,0x14,0x01,0x19]
-[0x40,0x54,0x01,0x19]
-[0x40,0x94,0x01,0x19]
-[0x40,0xd4,0x01,0x19]
-[0x40,0x24,0x01,0x19]
-[0x40,0x64,0x01,0x19]
-[0x40,0xa4,0x01,0x19]
-[0x40,0xe4,0x01,0x19]
-[0x40,0x34,0x01,0x19]
-[0x40,0x74,0x01,0x19]
-[0x40,0xb4,0x01,0x19]
-[0x40,0xf4,0x01,0x19]
-# CHECK-MOPS: cpyfp	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfpwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfprn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfpn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfpwt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfpwtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfpwtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfpwtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfprt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfprtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfprtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfprtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfpt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfptwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfptrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfptn	[x0]!, [x1]!, x2!
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x40,0x04,0x41,0x19]
-[0x40,0x44,0x41,0x19]
-[0x40,0x84,0x41,0x19]
-[0x40,0xc4,0x41,0x19]
-[0x40,0x14,0x41,0x19]
-[0x40,0x54,0x41,0x19]
-[0x40,0x94,0x41,0x19]
-[0x40,0xd4,0x41,0x19]
-[0x40,0x24,0x41,0x19]
-[0x40,0x64,0x41,0x19]
-[0x40,0xa4,0x41,0x19]
-[0x40,0xe4,0x41,0x19]
-[0x40,0x34,0x41,0x19]
-[0x40,0x74,0x41,0x19]
-[0x40,0xb4,0x41,0x19]
-[0x40,0xf4,0x41,0x19]
-# CHECK-MOPS: cpyfm	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmwt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmwtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmwtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmwtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmrt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmrtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmrtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmrtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfmtn	[x0]!, [x1]!, x2!
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x40,0x04,0x81,0x19]
-[0x40,0x44,0x81,0x19]
-[0x40,0x84,0x81,0x19]
-[0x40,0xc4,0x81,0x19]
-[0x40,0x14,0x81,0x19]
-[0x40,0x54,0x81,0x19]
-[0x40,0x94,0x81,0x19]
-[0x40,0xd4,0x81,0x19]
-[0x40,0x24,0x81,0x19]
-[0x40,0x64,0x81,0x19]
-[0x40,0xa4,0x81,0x19]
-[0x40,0xe4,0x81,0x19]
-[0x40,0x34,0x81,0x19]
-[0x40,0x74,0x81,0x19]
-[0x40,0xb4,0x81,0x19]
-[0x40,0xf4,0x81,0x19]
-# CHECK-MOPS: cpyfe	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfewn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfern	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfen	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfewt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfewtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfewtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfewtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfert	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfertwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfertrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfertn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfet	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfetwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfetrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyfetn	[x0]!, [x1]!, x2!
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x40,0x04,0x01,0x1d]
-[0x40,0x44,0x01,0x1d]
-[0x40,0x84,0x01,0x1d]
-[0x40,0xc4,0x01,0x1d]
-[0x40,0x14,0x01,0x1d]
-[0x40,0x54,0x01,0x1d]
-[0x40,0x94,0x01,0x1d]
-[0x40,0xd4,0x01,0x1d]
-[0x40,0x24,0x01,0x1d]
-[0x40,0x64,0x01,0x1d]
-[0x40,0xa4,0x01,0x1d]
-[0x40,0xe4,0x01,0x1d]
-[0x40,0x34,0x01,0x1d]
-[0x40,0x74,0x01,0x1d]
-[0x40,0xb4,0x01,0x1d]
-[0x40,0xf4,0x01,0x1d]
-# CHECK-MOPS: cpyp	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpypwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyprn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpypn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpypwt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpypwtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpypwtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpypwtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyprt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyprtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyprtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyprtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpypt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyptwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyptrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyptn	[x0]!, [x1]!, x2!
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x40,0x04,0x41,0x1d]
-[0x40,0x44,0x41,0x1d]
-[0x40,0x84,0x41,0x1d]
-[0x40,0xc4,0x41,0x1d]
-[0x40,0x14,0x41,0x1d]
-[0x40,0x54,0x41,0x1d]
-[0x40,0x94,0x41,0x1d]
-[0x40,0xd4,0x41,0x1d]
-[0x40,0x24,0x41,0x1d]
-[0x40,0x64,0x41,0x1d]
-[0x40,0xa4,0x41,0x1d]
-[0x40,0xe4,0x41,0x1d]
-[0x40,0x34,0x41,0x1d]
-[0x40,0x74,0x41,0x1d]
-[0x40,0xb4,0x41,0x1d]
-[0x40,0xf4,0x41,0x1d]
-# CHECK-MOPS: cpym	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymwt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymwtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymwtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymwtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymrt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymrtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymrtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymrtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpymtn	[x0]!, [x1]!, x2!
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x40,0x04,0x81,0x1d]
-[0x40,0x44,0x81,0x1d]
-[0x40,0x84,0x81,0x1d]
-[0x40,0xc4,0x81,0x1d]
-[0x40,0x14,0x81,0x1d]
-[0x40,0x54,0x81,0x1d]
-[0x40,0x94,0x81,0x1d]
-[0x40,0xd4,0x81,0x1d]
-[0x40,0x24,0x81,0x1d]
-[0x40,0x64,0x81,0x1d]
-[0x40,0xa4,0x81,0x1d]
-[0x40,0xe4,0x81,0x1d]
-[0x40,0x34,0x81,0x1d]
-[0x40,0x74,0x81,0x1d]
-[0x40,0xb4,0x81,0x1d]
-[0x40,0xf4,0x81,0x1d]
-# CHECK-MOPS: cpye	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyewn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyern	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyen	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyewt	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyewtwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyewtrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyewtn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyert	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyertwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyertrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyertn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyet	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyetwn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyetrn	[x0]!, [x1]!, x2!
-# CHECK-MOPS: cpyetn	[x0]!, [x1]!, x2!
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x20,0x04,0xc2,0x19]
-[0x20,0x14,0xc2,0x19]
-[0x20,0x24,0xc2,0x19]
-[0x20,0x34,0xc2,0x19]
-# CHECK-MOPS: setp	[x0]!, x1!, x2
-# CHECK-MOPS: setpt	[x0]!, x1!, x2
-# CHECK-MOPS: setpn	[x0]!, x1!, x2
-# CHECK-MOPS: setptn	[x0]!, x1!, x2
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x20,0x44,0xc2,0x19]
-[0x20,0x54,0xc2,0x19]
-[0x20,0x64,0xc2,0x19]
-[0x20,0x74,0xc2,0x19]
-# CHECK-MOPS: setm	[x0]!, x1!, x2
-# CHECK-MOPS: setmt	[x0]!, x1!, x2
-# CHECK-MOPS: setmn	[x0]!, x1!, x2
-# CHECK-MOPS: setmtn	[x0]!, x1!, x2
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x20,0x84,0xc2,0x19]
-[0x20,0x94,0xc2,0x19]
-[0x20,0xa4,0xc2,0x19]
-[0x20,0xb4,0xc2,0x19]
-# CHECK-MOPS: sete	[x0]!, x1!, x2
-# CHECK-MOPS: setet	[x0]!, x1!, x2
-# CHECK-MOPS: seten	[x0]!, x1!, x2
-# CHECK-MOPS: setetn	[x0]!, x1!, x2
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-# CHECK-NO-MOPS: warning: invalid instruction encoding
-
-[0x20,0x04,0xc2,0x1d]
-[0x20,0x14,0xc2,0x1d]
-[0x20,0x24,0xc2,0x1d]
-[0x20,0x34,0xc2,0x1d]
-# CHECK-MTE: setgp	[x0]!, x1!, x2
-# CHECK-MTE: setgpt [x0]!, x1!, x2
-# CHECK-MTE: setgpn	[x0]!, x1!, x2
-# CHECK-MTE: setgptn	[x0]!, x1!, x2
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-
-[0x20,0x44,0xc2,0x1d]
-[0x20,0x54,0xc2,0x1d]
-[0x20,0x64,0xc2,0x1d]
-[0x20,0x74,0xc2,0x1d]
-# CHECK-MTE: setgm	[x0]!, x1!, x2
-# CHECK-MTE: setgmt	[x0]!, x1!, x2
-# CHECK-MTE: setgmn	[x0]!, x1!, x2
-# CHECK-MTE: setgmtn	[x0]!, x1!, x2
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-
-[0x20,0x84,0xc2,0x1d]
-[0x20,0x94,0xc2,0x1d]
-[0x20,0xa4,0xc2,0x1d]
-[0x20,0xb4,0xc2,0x1d]
-# CHECK-MTE: setge	[x0]!, x1!, x2
-# CHECK-MTE: setget	[x0]!, x1!, x2
-# CHECK-MTE: setgen	[x0]!, x1!, x2
-# CHECK-MTE: setgetn	[x0]!, x1!, x2
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-# CHECK-NO-MTE: warning: invalid instruction encoding
-
-
-# Register number 31 (SP or XZR) is not allowed in address positions.
-# cpyfp
-[0x5f,0x04,0x01,0x19]
-[0x40,0x04,0x1f,0x19]
-# cpyfm
-[0x5f,0x04,0x41,0x19]
-[0x40,0x04,0x5f,0x19]
-# cpyfe
-[0x5f,0x04,0x81,0x19]
-[0x40,0x04,0x9f,0x19]
-# cpyp
-[0x5f,0x04,0x01,0x1d]
-[0x40,0x04,0x1f,0x1d]
-# cpym
-[0x5f,0x04,0x41,0x1d]
-[0x40,0x04,0x5f,0x1d]
-# cpye
-[0x5f,0x04,0x81,0x1d]
-[0x40,0x04,0x9f,0x1d]
-# setp
-[0x5f,0x04,0xc2,0x19]
-# setm
-[0x5f,0x44,0xc2,0x19]
-# sete
-[0x5f,0x84,0xc2,0x19]
-# setgp
-[0x5f,0x04,0xc2,0x1d]
-# setgm
-[0x5f,0x44,0xc2,0x1d]
-# setge
-[0x5f,0x84,0xc2,0x1d]
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
-# CHECK-INVALID: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.9a-ats1a.txt b/llvm/test/MC/Disassembler/AArch64/armv8.9a-ats1a.txt
deleted file mode 100644
index 03aca5e..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv8.9a-ats1a.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# RUN: llvm-mc -triple=aarch64 -disassemble %s | FileCheck %s
-
-[0x41,0x79,0x08,0xd5]
-# CHECK: at s1e1a, x1
-
-[0x41,0x79,0x0c,0xd5]
-# CHECK: at s1e2a, x1
-
-[0x41,0x79,0x0e,0xd5]
-# CHECK: at s1e3a, x1
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.9a-clrbhb.txt b/llvm/test/MC/Disassembler/AArch64/armv8.9a-clrbhb.txt
deleted file mode 100644
index f8c7e9f..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv8.9a-clrbhb.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-# CLRBHB is optional for all v8a/v9a, mandatory for 8.9a/9.4a.
-# Should disassemble to hint #22 if the feature is not present.
-# RUN: llvm-mc -triple=aarch64 -disassemble %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v8a %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v8.9a,-clrbhb %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v9.3a %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v9.4a,-clrbhb %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+clrbhb %s | FileCheck %s --check-prefix=CLRBHB
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v8a,+clrbhb %s | FileCheck %s --check-prefix=CLRBHB
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v8.9a %s | FileCheck %s --check-prefix=CLRBHB
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v9.3a,+clrbhb %s | FileCheck %s --check-prefix=CLRBHB
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v9.4a %s | FileCheck %s --check-prefix=CLRBHB
-
-[0xdf,0x22,0x03,0xd5]
-# CLRBHB: clrbhb
-# HINT_22: hint #22
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.9a-debug-pmu.txt b/llvm/test/MC/Disassembler/AArch64/armv8.9a-debug-pmu.txt
deleted file mode 100644
index ff898fe..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv8.9a-debug-pmu.txt
+++ /dev/null
@@ -1,730 +0,0 @@
-# RUN: llvm-mc -triple=aarch64               -mattr=+ite -disassemble %s | FileCheck %s
-# RUN: llvm-mc -triple=aarch64 -mattr=+v8.8a -mattr=+ite -disassemble %s | FileCheck %s
-# RUN: llvm-mc -triple=aarch64 -mattr=+v9.3a -mattr=+ite -disassemble %s | FileCheck %s
-# RUN: llvm-mc -triple=aarch64 -mattr=+v8.9a -mattr=+ite -disassemble %s | FileCheck %s
-# RUN: llvm-mc -triple=aarch64 -mattr=+v9.4a -mattr=+ite -disassemble %s | FileCheck %s
-
-# RUN: llvm-mc -triple=aarch64                           -disassemble %s | FileCheck %s --check-prefix=ERROR-NO-ITE
-# RUN: llvm-mc -triple=aarch64 -mattr=+v8.8a             -disassemble %s | FileCheck %s --check-prefix=ERROR-NO-ITE
-# RUN: llvm-mc -triple=aarch64 -mattr=+v9.3a             -disassemble %s | FileCheck %s --check-prefix=ERROR-NO-ITE
-# RUN: llvm-mc -triple=aarch64 -mattr=+v8.9a             -disassemble %s | FileCheck %s --check-prefix=ERROR-NO-ITE
-# RUN: llvm-mc -triple=aarch64 -mattr=+v9.4a             -disassemble %s | FileCheck %s --check-prefix=ERROR-NO-ITE
-
-[0x83,0x00,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR0_EL1
-[0x81,0x00,0x10,0xd5]
-# CHECK:    msr DBGBVR0_EL1, x1
-[0x83,0x01,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR1_EL1
-[0x81,0x01,0x10,0xd5]
-# CHECK:    msr DBGBVR1_EL1, x1
-[0x83,0x02,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR2_EL1
-[0x81,0x02,0x10,0xd5]
-# CHECK:    msr DBGBVR2_EL1, x1
-[0x83,0x03,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR3_EL1
-[0x81,0x03,0x10,0xd5]
-# CHECK:    msr DBGBVR3_EL1, x1
-[0x83,0x04,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR4_EL1
-[0x81,0x04,0x10,0xd5]
-# CHECK:    msr DBGBVR4_EL1, x1
-[0x83,0x05,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR5_EL1
-[0x81,0x05,0x10,0xd5]
-# CHECK:    msr DBGBVR5_EL1, x1
-[0x83,0x06,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR6_EL1
-[0x81,0x06,0x10,0xd5]
-# CHECK:    msr DBGBVR6_EL1, x1
-[0x83,0x07,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR7_EL1
-[0x81,0x07,0x10,0xd5]
-# CHECK:    msr DBGBVR7_EL1, x1
-[0x83,0x08,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR8_EL1
-[0x81,0x08,0x10,0xd5]
-# CHECK:    msr DBGBVR8_EL1, x1
-[0x83,0x09,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR9_EL1
-[0x81,0x09,0x10,0xd5]
-# CHECK:    msr DBGBVR9_EL1, x1
-[0x83,0x0a,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR10_EL1
-[0x81,0x0a,0x10,0xd5]
-# CHECK:    msr DBGBVR10_EL1, x1
-[0x83,0x0b,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR11_EL1
-[0x81,0x0b,0x10,0xd5]
-# CHECK:    msr DBGBVR11_EL1, x1
-[0x83,0x0c,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR12_EL1
-[0x81,0x0c,0x10,0xd5]
-# CHECK:    msr DBGBVR12_EL1, x1
-[0x83,0x0d,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR13_EL1
-[0x81,0x0d,0x10,0xd5]
-# CHECK:    msr DBGBVR13_EL1, x1
-[0x83,0x0e,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR14_EL1
-[0x81,0x0e,0x10,0xd5]
-# CHECK:    msr DBGBVR14_EL1, x1
-[0x83,0x0f,0x30,0xd5]
-# CHECK:    mrs x3, DBGBVR15_EL1
-[0x81,0x0f,0x10,0xd5]
-# CHECK:    msr DBGBVR15_EL1, x1
-
-[0xa3,0x00,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR0_EL1
-[0xa1,0x00,0x10,0xd5]
-# CHECK:    msr DBGBCR0_EL1, x1
-[0xa3,0x01,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR1_EL1
-[0xa1,0x01,0x10,0xd5]
-# CHECK:    msr DBGBCR1_EL1, x1
-[0xa3,0x02,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR2_EL1
-[0xa1,0x02,0x10,0xd5]
-# CHECK:    msr DBGBCR2_EL1, x1
-[0xa3,0x03,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR3_EL1
-[0xa1,0x03,0x10,0xd5]
-# CHECK:    msr DBGBCR3_EL1, x1
-[0xa3,0x04,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR4_EL1
-[0xa1,0x04,0x10,0xd5]
-# CHECK:    msr DBGBCR4_EL1, x1
-[0xa3,0x05,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR5_EL1
-[0xa1,0x05,0x10,0xd5]
-# CHECK:    msr DBGBCR5_EL1, x1
-[0xa3,0x06,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR6_EL1
-[0xa1,0x06,0x10,0xd5]
-# CHECK:    msr DBGBCR6_EL1, x1
-[0xa3,0x07,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR7_EL1
-[0xa1,0x07,0x10,0xd5]
-# CHECK:    msr DBGBCR7_EL1, x1
-[0xa3,0x08,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR8_EL1
-[0xa1,0x08,0x10,0xd5]
-# CHECK:    msr DBGBCR8_EL1, x1
-[0xa3,0x09,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR9_EL1
-[0xa1,0x09,0x10,0xd5]
-# CHECK:    msr DBGBCR9_EL1, x1
-[0xa3,0x0a,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR10_EL1
-[0xa1,0x0a,0x10,0xd5]
-# CHECK:    msr DBGBCR10_EL1, x1
-[0xa3,0x0b,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR11_EL1
-[0xa1,0x0b,0x10,0xd5]
-# CHECK:    msr DBGBCR11_EL1, x1
-[0xa3,0x0c,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR12_EL1
-[0xa1,0x0c,0x10,0xd5]
-# CHECK:    msr DBGBCR12_EL1, x1
-[0xa3,0x0d,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR13_EL1
-[0xa1,0x0d,0x10,0xd5]
-# CHECK:    msr DBGBCR13_EL1, x1
-[0xa3,0x0e,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR14_EL1
-[0xa1,0x0e,0x10,0xd5]
-# CHECK:    msr DBGBCR14_EL1, x1
-[0xa3,0x0f,0x30,0xd5]
-# CHECK:    mrs x3, DBGBCR15_EL1
-[0xa1,0x0f,0x10,0xd5]
-# CHECK:    msr DBGBCR15_EL1, x1
-
-[0xc3,0x00,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR0_EL1
-[0xc1,0x00,0x10,0xd5]
-# CHECK:    msr DBGWVR0_EL1, x1
-[0xc3,0x01,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR1_EL1
-[0xc1,0x01,0x10,0xd5]
-# CHECK:    msr DBGWVR1_EL1, x1
-[0xc3,0x02,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR2_EL1
-[0xc1,0x02,0x10,0xd5]
-# CHECK:    msr DBGWVR2_EL1, x1
-[0xc3,0x03,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR3_EL1
-[0xc1,0x03,0x10,0xd5]
-# CHECK:    msr DBGWVR3_EL1, x1
-[0xc3,0x04,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR4_EL1
-[0xc1,0x04,0x10,0xd5]
-# CHECK:    msr DBGWVR4_EL1, x1
-[0xc3,0x05,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR5_EL1
-[0xc1,0x05,0x10,0xd5]
-# CHECK:    msr DBGWVR5_EL1, x1
-[0xc3,0x06,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR6_EL1
-[0xc1,0x06,0x10,0xd5]
-# CHECK:    msr DBGWVR6_EL1, x1
-[0xc3,0x07,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR7_EL1
-[0xc1,0x07,0x10,0xd5]
-# CHECK:    msr DBGWVR7_EL1, x1
-[0xc3,0x08,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR8_EL1
-[0xc1,0x08,0x10,0xd5]
-# CHECK:    msr DBGWVR8_EL1, x1
-[0xc3,0x09,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR9_EL1
-[0xc1,0x09,0x10,0xd5]
-# CHECK:    msr DBGWVR9_EL1, x1
-[0xc3,0x0a,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR10_EL1
-[0xc1,0x0a,0x10,0xd5]
-# CHECK:    msr DBGWVR10_EL1, x1
-[0xc3,0x0b,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR11_EL1
-[0xc1,0x0b,0x10,0xd5]
-# CHECK:    msr DBGWVR11_EL1, x1
-[0xc3,0x0c,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR12_EL1
-[0xc1,0x0c,0x10,0xd5]
-# CHECK:    msr DBGWVR12_EL1, x1
-[0xc3,0x0d,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR13_EL1
-[0xc1,0x0d,0x10,0xd5]
-# CHECK:    msr DBGWVR13_EL1, x1
-[0xc3,0x0e,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR14_EL1
-[0xc1,0x0e,0x10,0xd5]
-# CHECK:    msr DBGWVR14_EL1, x1
-[0xc3,0x0f,0x30,0xd5]
-# CHECK:    mrs x3, DBGWVR15_EL1
-[0xc1,0x0f,0x10,0xd5]
-# CHECK:    msr DBGWVR15_EL1, x1
-
-[0xe3,0x00,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR0_EL1
-[0xe1,0x00,0x10,0xd5]
-# CHECK:    msr DBGWCR0_EL1, x1
-[0xe3,0x01,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR1_EL1
-[0xe1,0x01,0x10,0xd5]
-# CHECK:    msr DBGWCR1_EL1, x1
-[0xe3,0x02,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR2_EL1
-[0xe1,0x02,0x10,0xd5]
-# CHECK:    msr DBGWCR2_EL1, x1
-[0xe3,0x03,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR3_EL1
-[0xe1,0x03,0x10,0xd5]
-# CHECK:    msr DBGWCR3_EL1, x1
-[0xe3,0x04,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR4_EL1
-[0xe1,0x04,0x10,0xd5]
-# CHECK:    msr DBGWCR4_EL1, x1
-[0xe3,0x05,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR5_EL1
-[0xe1,0x05,0x10,0xd5]
-# CHECK:    msr DBGWCR5_EL1, x1
-[0xe3,0x06,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR6_EL1
-[0xe1,0x06,0x10,0xd5]
-# CHECK:    msr DBGWCR6_EL1, x1
-[0xe3,0x07,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR7_EL1
-[0xe1,0x07,0x10,0xd5]
-# CHECK:    msr DBGWCR7_EL1, x1
-[0xe3,0x08,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR8_EL1
-[0xe1,0x08,0x10,0xd5]
-# CHECK:    msr DBGWCR8_EL1, x1
-[0xe3,0x09,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR9_EL1
-[0xe1,0x09,0x10,0xd5]
-# CHECK:    msr DBGWCR9_EL1, x1
-[0xe3,0x0a,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR10_EL1
-[0xe1,0x0a,0x10,0xd5]
-# CHECK:    msr DBGWCR10_EL1, x1
-[0xe3,0x0b,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR11_EL1
-[0xe1,0x0b,0x10,0xd5]
-# CHECK:    msr DBGWCR11_EL1, x1
-[0xe3,0x0c,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR12_EL1
-[0xe1,0x0c,0x10,0xd5]
-# CHECK:    msr DBGWCR12_EL1, x1
-[0xe3,0x0d,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR13_EL1
-[0xe1,0x0d,0x10,0xd5]
-# CHECK:    msr DBGWCR13_EL1, x1
-[0xe3,0x0e,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR14_EL1
-[0xe1,0x0e,0x10,0xd5]
-# CHECK:    msr DBGWCR14_EL1, x1
-[0xe3,0x0f,0x30,0xd5]
-# CHECK:    mrs x3, DBGWCR15_EL1
-[0xe1,0x0f,0x10,0xd5]
-# CHECK:    msr DBGWCR15_EL1, x1
-
-[0x43,0x04,0x30,0xd5]
-# CHECK:	mrs	x3, MDSELR_EL1
-[0x41,0x04,0x10,0xd5]
-# CHECK:	msr	MDSELR_EL1, x1
-
-[0x83,0x9e,0x38,0xd5]
-# CHECK:	mrs	x3, PMUACR_EL1
-[0x81,0x9e,0x18,0xd5]
-# CHECK:	msr	PMUACR_EL1, x1
-
-[0xe3,0xeb,0x30,0xd5]
-# CHECK:	mrs	x3, PMCCNTSVR_EL1
-[0x03,0xec,0x30,0xd5]
-# CHECK:	mrs	x3, PMICNTSVR_EL1
-[0x63,0x9d,0x38,0xd5]
-# CHECK:	mrs	x3, PMSSCR_EL1
-[0x61,0x9d,0x18,0xd5]
-# CHECK:	msr	PMSSCR_EL1, x1
-[0x03,0xe8,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR0_EL1
-[0x23,0xe8,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR1_EL1
-[0x43,0xe8,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR2_EL1
-[0x63,0xe8,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR3_EL1
-[0x83,0xe8,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR4_EL1
-[0xa3,0xe8,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR5_EL1
-[0xc3,0xe8,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR6_EL1
-[0xe3,0xe8,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR7_EL1
-[0x03,0xe9,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR8_EL1
-[0x23,0xe9,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR9_EL1
-[0x43,0xe9,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR10_EL1
-[0x63,0xe9,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR11_EL1
-[0x83,0xe9,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR12_EL1
-[0xa3,0xe9,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR13_EL1
-[0xc3,0xe9,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR14_EL1
-[0xe3,0xe9,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR15_EL1
-[0x03,0xea,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR16_EL1
-[0x23,0xea,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR17_EL1
-[0x43,0xea,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR18_EL1
-[0x63,0xea,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR19_EL1
-[0x83,0xea,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR20_EL1
-[0xa3,0xea,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR21_EL1
-[0xc3,0xea,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR22_EL1
-[0xe3,0xea,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR23_EL1
-[0x03,0xeb,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR24_EL1
-[0x23,0xeb,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR25_EL1
-[0x43,0xeb,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR26_EL1
-[0x63,0xeb,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR27_EL1
-[0x83,0xeb,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR28_EL1
-[0xa3,0xeb,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR29_EL1
-[0xc3,0xeb,0x30,0xd5]
-# CHECK:	mrs	x3, PMEVCNTSVR30_EL1
-
-[0x03,0x94,0x3b,0xd5]
-# CHECK:  mrs x3, PMICNTR_EL0
-[0x03,0x94,0x1b,0xd5]
-# CHECK:  msr PMICNTR_EL0, x3
-[0x03,0x96,0x3b,0xd5]
-# CHECK:  mrs x3, PMICFILTR_EL0
-[0x03,0x96,0x1b,0xd5]
-# CHECK:  msr PMICFILTR_EL0, x3
-
-[0x83,0x9d,0x1b,0xd5]
-# CHECK:  msr PMZR_EL0, x3
-
-[0xa3,0x9e,0x38,0xd5]
-# CHECK:	mrs	x3, PMECR_EL1
-[0xa1,0x9e,0x18,0xd5]
-# CHECK:	msr	PMECR_EL1, x1
-[0xe3,0x9e,0x38,0xd5]
-# CHECK:	mrs	x3, PMIAR_EL1
-[0xe1,0x9e,0x18,0xd5]
-# CHECK:	msr	PMIAR_EL1, x1
-
-[0x63,0x9d,0x30,0xd5]
-# CHECK:	mrs	x3, SPMACCESSR_EL1
-[0x61,0x9d,0x10,0xd5]
-# CHECK:	msr	SPMACCESSR_EL1, x1
-[0x63,0x9d,0x35,0xd5]
-# CHECK:	mrs	x3, SPMACCESSR_EL12
-[0x61,0x9d,0x15,0xd5]
-# CHECK:	msr	SPMACCESSR_EL12, x1
-[0x63,0x9d,0x34,0xd5]
-# CHECK:	mrs	x3, SPMACCESSR_EL2
-[0x61,0x9d,0x14,0xd5]
-# CHECK:	msr	SPMACCESSR_EL2, x1
-[0x63,0x9d,0x36,0xd5]
-# CHECK:	mrs	x3, SPMACCESSR_EL3
-[0x61,0x9d,0x16,0xd5]
-# CHECK:	msr	SPMACCESSR_EL3, x1
-[0x43,0x9c,0x33,0xd5]
-# CHECK:	mrs	x3, SPMCNTENCLR_EL0
-[0x41,0x9c,0x13,0xd5]
-# CHECK:	msr	SPMCNTENCLR_EL0, x1
-[0x23,0x9c,0x33,0xd5]
-# CHECK:	mrs	x3, SPMCNTENSET_EL0
-[0x21,0x9c,0x13,0xd5]
-# CHECK:	msr	SPMCNTENSET_EL0, x1
-[0x03,0x9c,0x33,0xd5]
-# CHECK:	mrs	x3, SPMCR_EL0
-[0x01,0x9c,0x13,0xd5]
-# CHECK:	msr	SPMCR_EL0, x1
-[0xc3,0x9d,0x30,0xd5]
-# CHECK:	mrs	x3, SPMDEVAFF_EL1
-[0xa3,0x9d,0x30,0xd5]
-# CHECK:	mrs	x3, SPMDEVARCH_EL1
-
-[0x03,0xe0,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR0_EL0
-[0x01,0xe0,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR0_EL0, x1
-[0x23,0xe0,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR1_EL0
-[0x21,0xe0,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR1_EL0, x1
-[0x43,0xe0,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR2_EL0
-[0x41,0xe0,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR2_EL0, x1
-[0x63,0xe0,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR3_EL0
-[0x61,0xe0,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR3_EL0, x1
-[0x83,0xe0,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR4_EL0
-[0x81,0xe0,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR4_EL0, x1
-[0xa3,0xe0,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR5_EL0
-[0xa1,0xe0,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR5_EL0, x1
-[0xc3,0xe0,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR6_EL0
-[0xc1,0xe0,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR6_EL0, x1
-[0xe3,0xe0,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR7_EL0
-[0xe1,0xe0,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR7_EL0, x1
-[0x03,0xe1,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR8_EL0
-[0x01,0xe1,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR8_EL0, x1
-[0x23,0xe1,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR9_EL0
-[0x21,0xe1,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR9_EL0, x1
-[0x43,0xe1,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR10_EL0
-[0x41,0xe1,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR10_EL0, x1
-[0x63,0xe1,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR11_EL0
-[0x61,0xe1,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR11_EL0, x1
-[0x83,0xe1,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR12_EL0
-[0x81,0xe1,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR12_EL0, x1
-[0xa3,0xe1,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR13_EL0
-[0xa1,0xe1,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR13_EL0, x1
-[0xc3,0xe1,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR14_EL0
-[0xc1,0xe1,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR14_EL0, x1
-[0xe3,0xe1,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVCNTR15_EL0
-[0xe1,0xe1,0x13,0xd5]
-# CHECK:	msr	SPMEVCNTR15_EL0, x1
-
-[0x03,0xe6,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R0_EL0
-[0x01,0xe6,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R0_EL0, x1
-[0x23,0xe6,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R1_EL0
-[0x21,0xe6,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R1_EL0, x1
-[0x43,0xe6,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R2_EL0
-[0x41,0xe6,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R2_EL0, x1
-[0x63,0xe6,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R3_EL0
-[0x61,0xe6,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R3_EL0, x1
-[0x83,0xe6,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R4_EL0
-[0x81,0xe6,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R4_EL0, x1
-[0xa3,0xe6,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R5_EL0
-[0xa1,0xe6,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R5_EL0, x1
-[0xc3,0xe6,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R6_EL0
-[0xc1,0xe6,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R6_EL0, x1
-[0xe3,0xe6,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R7_EL0
-[0xe1,0xe6,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R7_EL0, x1
-[0x03,0xe7,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R8_EL0
-[0x01,0xe7,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R8_EL0, x1
-[0x23,0xe7,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R9_EL0
-[0x21,0xe7,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R9_EL0, x1
-[0x43,0xe7,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R10_EL0
-[0x41,0xe7,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R10_EL0, x1
-[0x63,0xe7,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R11_EL0
-[0x61,0xe7,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R11_EL0, x1
-[0x83,0xe7,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R12_EL0
-[0x81,0xe7,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R12_EL0, x1
-[0xa3,0xe7,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R13_EL0
-[0xa1,0xe7,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R13_EL0, x1
-[0xc3,0xe7,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R14_EL0
-[0xc1,0xe7,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R14_EL0, x1
-[0xe3,0xe7,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILT2R15_EL0
-[0xe1,0xe7,0x13,0xd5]
-# CHECK:	msr	SPMEVFILT2R15_EL0, x1
-
-[0x03,0xe4,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR0_EL0
-[0x01,0xe4,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR0_EL0, x1
-[0x23,0xe4,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR1_EL0
-[0x21,0xe4,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR1_EL0, x1
-[0x43,0xe4,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR2_EL0
-[0x41,0xe4,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR2_EL0, x1
-[0x63,0xe4,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR3_EL0
-[0x61,0xe4,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR3_EL0, x1
-[0x83,0xe4,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR4_EL0
-[0x81,0xe4,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR4_EL0, x1
-[0xa3,0xe4,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR5_EL0
-[0xa1,0xe4,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR5_EL0, x1
-[0xc3,0xe4,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR6_EL0
-[0xc1,0xe4,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR6_EL0, x1
-[0xe3,0xe4,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR7_EL0
-[0xe1,0xe4,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR7_EL0, x1
-[0x03,0xe5,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR8_EL0
-[0x01,0xe5,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR8_EL0, x1
-[0x23,0xe5,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR9_EL0
-[0x21,0xe5,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR9_EL0, x1
-[0x43,0xe5,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR10_EL0
-[0x41,0xe5,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR10_EL0, x1
-[0x63,0xe5,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR11_EL0
-[0x61,0xe5,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR11_EL0, x1
-[0x83,0xe5,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR12_EL0
-[0x81,0xe5,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR12_EL0, x1
-[0xa3,0xe5,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR13_EL0
-[0xa1,0xe5,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR13_EL0, x1
-[0xc3,0xe5,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR14_EL0
-[0xc1,0xe5,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR14_EL0, x1
-[0xe3,0xe5,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVFILTR15_EL0
-[0xe1,0xe5,0x13,0xd5]
-# CHECK:	msr	SPMEVFILTR15_EL0, x1
-
-[0x03,0xe2,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER0_EL0
-[0x01,0xe2,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER0_EL0, x1
-[0x23,0xe2,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER1_EL0
-[0x21,0xe2,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER1_EL0, x1
-[0x43,0xe2,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER2_EL0
-[0x41,0xe2,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER2_EL0, x1
-[0x63,0xe2,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER3_EL0
-[0x61,0xe2,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER3_EL0, x1
-[0x83,0xe2,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER4_EL0
-[0x81,0xe2,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER4_EL0, x1
-[0xa3,0xe2,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER5_EL0
-[0xa1,0xe2,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER5_EL0, x1
-[0xc3,0xe2,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER6_EL0
-[0xc1,0xe2,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER6_EL0, x1
-[0xe3,0xe2,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER7_EL0
-[0xe1,0xe2,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER7_EL0, x1
-[0x03,0xe3,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER8_EL0
-[0x01,0xe3,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER8_EL0, x1
-[0x23,0xe3,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER9_EL0
-[0x21,0xe3,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER9_EL0, x1
-[0x43,0xe3,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER10_EL0
-[0x41,0xe3,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER10_EL0, x1
-[0x63,0xe3,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER11_EL0
-[0x61,0xe3,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER11_EL0, x1
-[0x83,0xe3,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER12_EL0
-[0x81,0xe3,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER12_EL0, x1
-[0xa3,0xe3,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER13_EL0
-[0xa1,0xe3,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER13_EL0, x1
-[0xc3,0xe3,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER14_EL0
-[0xc1,0xe3,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER14_EL0, x1
-[0xe3,0xe3,0x33,0xd5]
-# CHECK:	mrs	x3, SPMEVTYPER15_EL0
-[0xe1,0xe3,0x13,0xd5]
-# CHECK:	msr	SPMEVTYPER15_EL0, x1
-
-[0x83,0x9d,0x30,0xd5]
-# CHECK:	mrs	x3, SPMIIDR_EL1
-[0x43,0x9e,0x30,0xd5]
-# CHECK:	mrs	x3, SPMINTENCLR_EL1
-[0x41,0x9e,0x10,0xd5]
-# CHECK:	msr	SPMINTENCLR_EL1, x1
-[0x23,0x9e,0x30,0xd5]
-# CHECK:	mrs	x3, SPMINTENSET_EL1
-[0x21,0x9e,0x10,0xd5]
-# CHECK:	msr	SPMINTENSET_EL1, x1
-[0x63,0x9c,0x33,0xd5]
-# CHECK:	mrs	x3, SPMOVSCLR_EL0
-[0x61,0x9c,0x13,0xd5]
-# CHECK:	msr	SPMOVSCLR_EL0, x1
-[0x63,0x9e,0x33,0xd5]
-# CHECK:	mrs	x3, SPMOVSSET_EL0
-[0x61,0x9e,0x13,0xd5]
-# CHECK:	msr	SPMOVSSET_EL0, x1
-[0xa3,0x9c,0x33,0xd5]
-# CHECK:	mrs	x3, SPMSELR_EL0
-[0xa1,0x9c,0x13,0xd5]
-# CHECK:	msr	SPMSELR_EL0, x1
-[0x03,0x9d,0x30,0xd5]
-# CHECK:  mrs x3, SPMCGCR0_EL1
-[0x23,0x9d,0x30,0xd5]
-# CHECK:  mrs x3, SPMCGCR1_EL1
-[0xe3,0x9d,0x30,0xd5]
-# CHECK:  mrs x3, SPMCFGR_EL1
-[0xe3,0x9e,0x36,0xd5]
-# CHECK:  mrs x3, SPMROOTCR_EL3
-[0xe3,0x9e,0x16,0xd5]
-# CHECK:  msr SPMROOTCR_EL3, x3
-[0xe3,0x9e,0x37,0xd5]
-# CHECK:  mrs x3, SPMSCR_EL1
-[0xe3,0x9e,0x17,0xd5]
-# CHECK:  msr SPMSCR_EL1, x3
-
-[0x63,0x12,0x38,0xd5]
-# CHECK:	mrs	x3, TRCITECR_EL1
-# ERROR-NO-ITE:    mrs x3, S3_0_C1_C2_3
-[0x61,0x12,0x18,0xd5]
-# CHECK:	msr	TRCITECR_EL1, x1
-# ERROR-NO-ITE:    msr S3_0_C1_C2_3, x1
-[0x63,0x12,0x3d,0xd5]
-# CHECK:	mrs	x3, TRCITECR_EL12
-# ERROR-NO-ITE:    mrs x3, S3_5_C1_C2_3
-[0x61,0x12,0x1d,0xd5]
-# CHECK:	msr	TRCITECR_EL12, x1
-# ERROR-NO-ITE:    msr S3_5_C1_C2_3, x1
-[0x63,0x12,0x3c,0xd5]
-# CHECK:	mrs	x3, TRCITECR_EL2
-# ERROR-NO-ITE:    mrs x3, S3_4_C1_C2_3
-[0x61,0x12,0x1c,0xd5]
-# CHECK:	msr	TRCITECR_EL2, x1
-# ERROR-NO-ITE:    msr S3_4_C1_C2_3, x1
-[0xe1,0x72,0x0b,0xd5]
-# CHECK:    trcit x1
-# ERROR-NO-ITE:    sys #3, c7, c2, #7, x1
-
-[0x83,0x9a,0x38,0xd5]
-# CHECK:    mrs x3, PMSDSFR_EL1
-[0x83,0x9a,0x18,0xd5]
-# CHECK:    msr PMSDSFR_EL1, x3
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.9a-lrcpc3.txt b/llvm/test/MC/Disassembler/AArch64/armv8.9a-lrcpc3.txt
deleted file mode 100644
index 644e032..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv8.9a-lrcpc3.txt
+++ /dev/null
@@ -1,113 +0,0 @@
-# RUN: llvm-mc -triple aarch64-none-linux-gnu -disassemble -show-encoding               -mattr=+rcpc3 < %s | FileCheck %s
-# RUN: llvm-mc -triple aarch64-none-linux-gnu -disassemble -show-encoding -mattr=+v8.9a -mattr=+rcpc3 < %s | FileCheck %s
-# RUN: llvm-mc -triple aarch64-none-linux-gnu -disassemble -show-encoding -mattr=+v9.4a -mattr=+rcpc3 < %s | FileCheck %s
-
-# RUN: not llvm-mc -triple aarch64-none-linux-gnu -disassemble               < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-RCPC3 %s
-# RUN: not llvm-mc -triple aarch64-none-linux-gnu -disassemble -mattr=+v8.9a < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-RCPC3 %s
-# RUN: not llvm-mc -triple aarch64-none-linux-gnu -disassemble -mattr=+v9.4a < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-RCPC3 %s
-
-[0x18,0x0a,0x00,0x99]
-# CHECK:      stilp   w24, w0, [x16, #-8]!     // encoding: [0x18,0x0a,0x00,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x18,0x0a,0x00,0x99]
-# CHECK:      stilp   w24, w0, [x16, #-8]!     // encoding: [0x18,0x0a,0x00,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x39,0x0a,0x01,0xd9]
-# CHECK:      stilp   x25, x1, [x17, #-16]!    // encoding: [0x39,0x0a,0x01,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x39,0x0a,0x01,0xd9]
-# CHECK:      stilp   x25, x1, [x17, #-16]!    // encoding: [0x39,0x0a,0x01,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x5a,0x1a,0x02,0x99]
-# CHECK:      stilp   w26, w2, [x18]           // encoding: [0x5a,0x1a,0x02,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xfb,0x1b,0x03,0xd9]
-# CHECK:      stilp   x27, x3, [sp]            // encoding: [0xfb,0x1b,0x03,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x9c,0x0a,0x44,0x99]
-# CHECK:      ldiapp  w28, w4, [x20], #8       // encoding: [0x9c,0x0a,0x44,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x9c,0x0a,0x44,0x99]
-# CHECK:      ldiapp  w28, w4, [x20], #8       // encoding: [0x9c,0x0a,0x44,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xbd,0x0a,0x45,0xd9]
-# CHECK:      ldiapp  x29, x5, [x21], #16      // encoding: [0xbd,0x0a,0x45,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xbd,0x0a,0x45,0xd9]
-# CHECK:      ldiapp  x29, x5, [x21], #16      // encoding: [0xbd,0x0a,0x45,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xfe,0x1b,0x46,0x99]
-# CHECK:      ldiapp  w30, w6, [sp]            // encoding: [0xfe,0x1b,0x46,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xff,0x1a,0x47,0xd9]
-# CHECK:      ldiapp  xzr, x7, [x23]           // encoding: [0xff,0x1a,0x47,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0xe3,0x09,0x80,0x99]
-# CHECK:      stlr w3, [x15, #-4]!    // encoding: [0xe3,0x09,0x80,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x09,0x80,0x99]
-# CHECK:      stlr w3, [x15, #-4]!    // encoding: [0xe3,0x09,0x80,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x09,0x80,0xd9]
-# CHECK:      stlr x3, [x15, #-8]!    // encoding: [0xe3,0x09,0x80,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x0b,0x80,0xd9]
-# CHECK:      stlr x3, [sp, #-8]!     // encoding: [0xe3,0x0b,0x80,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x0b,0xc0,0x99]
-# CHECK:      ldapr w3, [sp], #4       // encoding: [0xe3,0x0b,0xc0,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x09,0xc0,0x99]
-# CHECK:      ldapr w3, [x15], #4      // encoding: [0xe3,0x09,0xc0,0x99]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x09,0xc0,0xd9]
-# CHECK:      ldapr x3, [x15], #8      // encoding: [0xe3,0x09,0xc0,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x09,0xc0,0xd9]
-# CHECK:      ldapr x3, [x15], #8      // encoding: [0xe3,0x09,0xc0,0xd9]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0xe3,0xf9,0x1f,0x1d]
-# CHECK:      stlur b3, [x15, #-1]  // encoding: [0xe3,0xf9,0x1f,0x1d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x29,0x00,0x5d]
-# CHECK:      stlur h3, [x15, #2]   // encoding: [0xe3,0x29,0x00,0x5d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0xd9,0x1f,0x9d]
-# CHECK:      stlur s3, [x15, #-3]  // encoding: [0xe3,0xd9,0x1f,0x9d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x4b,0x00,0xdd]
-# CHECK:      stlur d3, [sp, #4]    // encoding: [0xe3,0x4b,0x00,0xdd]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0xb9,0x9f,0x1d]
-# CHECK:      stlur q3, [x15, #-5]  // encoding: [0xe3,0xb9,0x9f,0x1d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x69,0x40,0x1d]
-# CHECK:      ldapur b3, [x15, #6]  // encoding: [0xe3,0x69,0x40,0x1d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x99,0x5f,0x5d]
-# CHECK:      ldapur h3, [x15, #-7] // encoding: [0xe3,0x99,0x5f,0x5d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x89,0x40,0x9d]
-# CHECK:      ldapur s3, [x15, #8]  // encoding: [0xe3,0x89,0x40,0x9d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x79,0x5f,0xdd]
-# CHECK:      ldapur d3, [x15, #-9] // encoding: [0xe3,0x79,0x5f,0xdd]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0xab,0xc0,0x1d]
-# CHECK:      ldapur q3, [sp, #10]  // encoding: [0xe3,0xab,0xc0,0x1d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0xe3,0x85,0x01,0x0d]
-# CHECK:      stl1  { v3.d }[0], [x15] // encoding: [0xe3,0x85,0x01,0x0d]
-# ERROR-NO-RCPC3:  [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x87,0x01,0x4d]
-# CHECK:      stl1  { v3.d }[1], [sp]  // encoding: [0xe3,0x87,0x01,0x4d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x87,0x41,0x0d]
-# CHECK:      ldap1 { v3.d }[0], [sp]  // encoding: [0xe3,0x87,0x41,0x0d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe3,0x85,0x41,0x4d]
-# CHECK:      ldap1 { v3.d }[1], [x15] // encoding: [0xe3,0x85,0x41,0x4d]
-# ERROR-NO-RCPC3: [[@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.9a-specres2.txt b/llvm/test/MC/Disassembler/AArch64/armv8.9a-specres2.txt
deleted file mode 100644
index a114cd3..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv8.9a-specres2.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-# FEAT_SPECRES2 is optional for all v8a/v9a, mandatory for 8.9a/9.4a.
-# Should disassemble to hint #22 if the feature is not present.
-# RUN: llvm-mc -triple=aarch64 -disassemble %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v8a %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v8.9a -mattr=-specres2 %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v9.3a %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v9.4a -mattr=-specres2 %s | FileCheck %s --check-prefix=HINT_22
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+specres2 %s | FileCheck %s --check-prefix=FEAT_SPECRES2
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v8a -mattr=+specres2 %s | FileCheck %s --check-prefix=FEAT_SPECRES2
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v8.9a %s | FileCheck %s --check-prefix=FEAT_SPECRES2
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v9.3a -mattr=+specres2 %s | FileCheck %s --check-prefix=FEAT_SPECRES2
-# RUN: llvm-mc -triple=aarch64 -disassemble -mattr=+v9.4a %s | FileCheck %s --check-prefix=FEAT_SPECRES2
-
-[0xc0,0x73,0x0b,0xd5]
-# FEAT_SPECRES2: cosp rctx, x0
-# HINT_22: sys #3, c7, c3, #6, x0
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.9a-the.txt b/llvm/test/MC/Disassembler/AArch64/armv8.9a-the.txt
deleted file mode 100644
index f3b313a..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv8.9a-the.txt
+++ /dev/null
@@ -1,482 +0,0 @@
-# RUN: llvm-mc -triple=aarch64                   -mattr=+the -mattr=+d128 -disassemble %s      | FileCheck %s
-# RUN: llvm-mc -triple=aarch64     -mattr=+v8.9a -mattr=+the -mattr=+d128 -disassemble %s      | FileCheck %s
-# RUN: llvm-mc -triple=aarch64     -mattr=+v9.4a -mattr=+the -mattr=+d128 -disassemble %s      | FileCheck %s
-# RUN: not llvm-mc -triple=aarch64                                          -disassemble %s 2>&1 | FileCheck %s --check-prefix=ERROR-NO-THE
-# RUN: not llvm-mc -triple=aarch64 -mattr=+v8.9a                            -disassemble %s 2>&1 | FileCheck %s --check-prefix=ERROR-NO-THE
-# RUN: not llvm-mc -triple=aarch64 -mattr=+v9.4a                            -disassemble %s 2>&1 | FileCheck %s --check-prefix=ERROR-NO-THE
-# RUN: not llvm-mc -triple=aarch64               -mattr=+the                -disassemble %s 2>&1 | FileCheck %s --check-prefix=ERROR-NO-D128
-# RUN: not llvm-mc -triple=aarch64 -mattr=+v8.9a -mattr=+the                -disassemble %s 2>&1 | FileCheck %s --check-prefix=ERROR-NO-D128
-# RUN: not llvm-mc -triple=aarch64 -mattr=+v9.4a -mattr=+the                -disassemble %s 2>&1 | FileCheck %s --check-prefix=ERROR-NO-D128
-
-[0xc3,0xd0,0x38,0xd5]
-# CHECK: mrs x3, RCWMASK_EL1
-[0xc1,0xd0,0x18,0xd5]
-# CHECK: msr RCWMASK_EL1, x1
-[0x63,0xd0,0x38,0xd5]
-# CHECK: mrs x3, RCWSMASK_EL1
-[0x61,0xd0,0x18,0xd5]
-# CHECK: msr RCWSMASK_EL1, x1
-
-[0x81,0x08,0x20,0x19]
-# CHECK: rcwcas   x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x08,0xa0,0x19]
-# CHECK: rcwcasa  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x08,0xe0,0x19]
-# CHECK: rcwcasal x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x08,0x60,0x19]
-# CHECK: rcwcasl  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x0b,0x23,0x19]
-# CHECK: rcwcas   x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x0b,0xa3,0x19]
-# CHECK: rcwcasa  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x0b,0xe3,0x19]
-# CHECK: rcwcasal x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x0b,0x63,0x19]
-# CHECK: rcwcasl  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0x81,0x08,0x20,0x59]
-# CHECK: rcwscas   x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x08,0xa0,0x59]
-# CHECK: rcwscasa  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x08,0xe0,0x59]
-# CHECK: rcwscasal x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x08,0x60,0x59]
-# CHECK: rcwscasl  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x0b,0x23,0x59]
-# CHECK: rcwscas   x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x0b,0xa3,0x59]
-# CHECK: rcwscasa  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x0b,0xe3,0x59]
-# CHECK: rcwscasal x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x0b,0x63,0x59]
-# CHECK: rcwscasl  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0x86,0x0c,0x20,0x19]
-# CHECK: rcwcasp   x0, x1, x6, x7, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x86,0x0c,0xa0,0x19]
-# CHECK: rcwcaspa  x0, x1, x6, x7, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x86,0x0c,0xe0,0x19]
-# CHECK: rcwcaspal x0, x1, x6, x7, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x86,0x0c,0x60,0x19]
-# CHECK: rcwcaspl  x0, x1, x6, x7, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe6,0x0f,0x24,0x19]
-# CHECK: rcwcasp   x4, x5, x6, x7, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe6,0x0f,0xa4,0x19]
-# CHECK: rcwcaspa  x4, x5, x6, x7, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe6,0x0f,0xe4,0x19]
-# CHECK: rcwcaspal x4, x5, x6, x7, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe6,0x0f,0x64,0x19]
-# CHECK: rcwcaspl  x4, x5, x6, x7, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-
-[0x86,0x0c,0x20,0x59]
-# CHECK: rcwscasp   x0, x1, x6, x7, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x86,0x0c,0xa0,0x59]
-# CHECK: rcwscaspa  x0, x1, x6, x7, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x86,0x0c,0xe0,0x59]
-# CHECK: rcwscaspal x0, x1, x6, x7, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x86,0x0c,0x60,0x59]
-# CHECK: rcwscaspl  x0, x1, x6, x7, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe6,0x0f,0x24,0x59]
-# CHECK: rcwscasp   x4, x5, x6, x7, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe6,0x0f,0xa4,0x59]
-# CHECK: rcwscaspa  x4, x5, x6, x7, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe6,0x0f,0xe4,0x59]
-# CHECK: rcwscaspal x4, x5, x6, x7, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe6,0x0f,0x64,0x59]
-# CHECK: rcwscaspl  x4, x5, x6, x7, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-
-[0x81,0x90,0x20,0x38]
-# CHECK: rcwclr   x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x90,0xa0,0x38]
-# CHECK: rcwclra  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x90,0xe0,0x38]
-# CHECK: rcwclral x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x90,0x60,0x38]
-# CHECK: rcwclrl  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0x23,0x38]
-# CHECK: rcwclr   x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0xa3,0x38]
-# CHECK: rcwclra  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0xe3,0x38]
-# CHECK: rcwclral x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0x63,0x38]
-# CHECK: rcwclrl  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0x81,0x90,0x20,0x78]
-# CHECK: rcwsclr   x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x90,0xa0,0x78]
-# CHECK: rcwsclra  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x90,0xe0,0x78]
-# CHECK: rcwsclral x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0x90,0x60,0x78]
-# CHECK: rcwsclrl  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0x23,0x78]
-# CHECK: rcwsclr   x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0xa3,0x78]
-# CHECK: rcwsclra  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0xe3,0x78]
-# CHECK: rcwsclral x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0x63,0x78]
-# CHECK: rcwsclrl  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0x81,0x90,0x20,0x19]
-# CHECK: rcwclrp   x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0x90,0xa0,0x19]
-# CHECK: rcwclrpa  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0x90,0xe0,0x19]
-# CHECK: rcwclrpal x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0x90,0x60,0x19]
-# CHECK: rcwclrpl  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0x23,0x19]
-# CHECK: rcwclrp   x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0xa3,0x19]
-# CHECK: rcwclrpa  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0xe3,0x19]
-# CHECK: rcwclrpal x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0x63,0x19]
-# CHECK: rcwclrpl  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-
-[0x81,0x90,0x20,0x59]
-# CHECK: rcwsclrp   x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0x90,0xa0,0x59]
-# CHECK: rcwsclrpa  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0x90,0xe0,0x59]
-# CHECK: rcwsclrpal x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0x90,0x60,0x59]
-# CHECK: rcwsclrpl  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0x23,0x59]
-# CHECK: rcwsclrp   x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0xa3,0x59]
-# CHECK: rcwsclrpa  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0xe3,0x59]
-# CHECK: rcwsclrpal x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0x93,0x63,0x59]
-# CHECK: rcwsclrpl  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-
-[0x81,0xb0,0x20,0x38]
-# CHECK: rcwset   x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0xa0,0x38]
-# CHECK: rcwseta  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0xe0,0x38]
-# CHECK: rcwsetal x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0x60,0x38]
-# CHECK: rcwsetl  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0x23,0x38]
-# CHECK: rcwset   x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0xa3,0x38]
-# CHECK: rcwseta  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0xe3,0x38]
-# CHECK: rcwsetal x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0x63,0x38]
-# CHECK: rcwsetl  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0x81,0xb0,0x20,0x78]
-# CHECK: rcwsset   x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0xa0,0x78]
-# CHECK: rcwsseta  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0xe0,0x78]
-# CHECK: rcwssetal x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0x60,0x78]
-# CHECK: rcwssetl  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0x23,0x78]
-# CHECK: rcwsset   x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0xa3,0x78]
-# CHECK: rcwsseta  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0xe3,0x78]
-# CHECK: rcwssetal x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0x63,0x78]
-# CHECK: rcwssetl  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0x81,0xb0,0x20,0x19]
-# CHECK: rcwsetp   x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0xa0,0x19]
-# CHECK: rcwsetpa  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0xe0,0x19]
-# CHECK: rcwsetpal x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0x60,0x19]
-# CHECK: rcwsetpl  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0x23,0x19]
-# CHECK: rcwsetp   x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0xa3,0x19]
-# CHECK: rcwsetpa  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0xe3,0x19]
-# CHECK: rcwsetpal x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0x63,0x19]
-# CHECK: rcwsetpl  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-
-[0x81,0xb0,0x20,0x59]
-# CHECK: rcwssetp   x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0xa0,0x59]
-# CHECK: rcwssetpa  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0xe0,0x59]
-# CHECK: rcwssetpal x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xb0,0x60,0x59]
-# CHECK: rcwssetpl  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0x23,0x59]
-# CHECK: rcwssetp   x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0xa3,0x59]
-# CHECK: rcwssetpa  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0xe3,0x59]
-# CHECK: rcwssetpal x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xb3,0x63,0x59]
-# CHECK: rcwssetpl  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-
-[0x81,0xa0,0x20,0x38]
-# CHECK: rcwswp   x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0xa0,0x38]
-# CHECK: rcwswpa  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0xe0,0x38]
-# CHECK: rcwswpal x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0x60,0x38]
-# CHECK: rcwswpl  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0x23,0x38]
-# CHECK: rcwswp   x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0xa3,0x38]
-# CHECK: rcwswpa  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0xe3,0x38]
-# CHECK: rcwswpal x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0x63,0x38]
-# CHECK: rcwswpl  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0x81,0xa0,0x20,0x78]
-# CHECK: rcwsswp   x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0xa0,0x78]
-# CHECK: rcwsswpa  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0xe0,0x78]
-# CHECK: rcwsswpal x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0x60,0x78]
-# CHECK: rcwsswpl  x0, x1, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0x23,0x78]
-# CHECK: rcwsswp   x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0xa3,0x78]
-# CHECK: rcwsswpa  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0xe3,0x78]
-# CHECK: rcwsswpal x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0x63,0x78]
-# CHECK: rcwsswpl  x3, x5, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-
-[0x81,0xa0,0x20,0x19]
-# CHECK: rcwswpp   x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0xa0,0x19]
-# CHECK: rcwswppa  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0xe0,0x19]
-# CHECK: rcwswppal x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0x60,0x19]
-# CHECK: rcwswppl  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0x23,0x19]
-# CHECK: rcwswpp   x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0xa3,0x19]
-# CHECK: rcwswppa  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0xe3,0x19]
-# CHECK: rcwswppal x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0x63,0x19]
-# CHECK: rcwswppl  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-
-[0x81,0xa0,0x20,0x59]
-# CHECK: rcwsswpp   x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0xa0,0x59]
-# CHECK: rcwsswppa  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0xe0,0x59]
-# CHECK: rcwsswppal x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0x81,0xa0,0x60,0x59]
-# CHECK: rcwsswppl  x1, x0, [x4]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0x23,0x59]
-# CHECK: rcwsswpp   x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0xa3,0x59]
-# CHECK: rcwsswppa  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0xe3,0x59]
-# CHECK: rcwsswppal x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
-[0xe5,0xa3,0x63,0x59]
-# CHECK: rcwsswppl  x5, x3, [sp]
-# ERROR-NO-THE: [[@LINE-2]]:2: warning: invalid instruction encoding
-# ERROR-NO-D128: [[@LINE-3]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9-sysp.txt b/llvm/test/MC/Disassembler/AArch64/armv9-sysp.txt
deleted file mode 100644
index 2bbdef4..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9-sysp.txt
+++ /dev/null
@@ -1,562 +0,0 @@
-# RUN: llvm-mc -triple aarch64 --disassemble -mattr=+d128,+tlb-rmi,+xs %s -o - 2> %t | FileCheck %s
-# RUN: FileCheck %s --check-prefix=INVALID --input-file=%t
-
-# RUN: llvm-mc -triple aarch64 --disassemble %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR-NO-D128
-
-0x00 0x20 0x48 0xd5
-0x20 0x20 0x48 0xd5
-0x00 0x74 0x48 0xd5
-0x60 0xd0 0x48 0xd5
-0xc0 0xd0 0x48 0xd5
-0x00 0x20 0x4c 0xd5
-0x20 0x20 0x4c 0xd5
-0x00 0x21 0x4c 0xd5
-0x00 0x20 0x48 0xd5
-0x20 0x20 0x48 0xd5
-0x00 0x74 0x48 0xd5
-0x60 0xd0 0x48 0xd5
-0xc0 0xd0 0x48 0xd5
-0x00 0x20 0x4c 0xd5
-0x20 0x20 0x4c 0xd5
-0x00 0x21 0x4c 0xd5
-0x00 0x20 0x48 0xd5
-0x02 0x20 0x48 0xd5
-0x04 0x20 0x48 0xd5
-0x06 0x20 0x48 0xd5
-0x08 0x20 0x48 0xd5
-0x0a 0x20 0x48 0xd5
-0x0c 0x20 0x48 0xd5
-0x0e 0x20 0x48 0xd5
-0x10 0x20 0x48 0xd5
-0x12 0x20 0x48 0xd5
-0x14 0x20 0x48 0xd5
-0x16 0x20 0x48 0xd5
-0x18 0x20 0x48 0xd5
-0x1a 0x20 0x48 0xd5
-0x1c 0x20 0x48 0xd5
-0x1e 0x20 0x48 0xd5
-0x1f 0x20 0x48 0xd5
-
-# CHECK: sysp #0, c2, c0, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #1, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c7, c4, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c13, c0, #3, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c13, c0, #6, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #4, c2, c0, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #4, c2, c0, #1, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #4, c2, c1, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #1, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c7, c4, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c13, c0, #3, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c13, c0, #6, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #4, c2, c0, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #4, c2, c0, #1, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #4, c2, c1, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x0, x1
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x2, x3
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x6, x7
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x12, x13
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x16, x17
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x22, x23
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x26, x27
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0, x30, xzr
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK-NEXT: sysp #0, c2, c0, #0
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-
-
-0x01 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x1, x2
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x01 0x20 0x48 0xd5
-
-0x03 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x3, x4
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x03 0x20 0x48 0xd5
-
-0x05 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x5, x6
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x05 0x20 0x48 0xd5
-
-0x07 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x7, x8
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x07 0x20 0x48 0xd5
-
-0x09 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x9, x10
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x09 0x20 0x48 0xd5
-
-0x0b 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x11, x12
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x0b 0x20 0x48 0xd5
-
-0x0d 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x13, x14
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x0d 0x20 0x48 0xd5
-
-0x0f 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x15, x16
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x0f 0x20 0x48 0xd5
-
-0x11 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x17, x18
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x11 0x20 0x48 0xd5
-
-0x13 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x19, x20
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x13 0x20 0x48 0xd5
-
-0x15 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x21, x22
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x15 0x20 0x48 0xd5
-
-0x17 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x23, x24
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x17 0x20 0x48 0xd5
-
-0x19 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x25, x26
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x19 0x20 0x48 0xd5
-
-0x1b 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x27, x28
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x1b 0x20 0x48 0xd5
-
-0x1d 0x20 0x48 0xd5 # sysp #0, c2, c0, #0, x29, x30
-# INVALID: warning: invalid instruction encoding
-# INVALID-NEXT: 0x1d 0x20 0x48 0xd5
-
-0x24 0x80 0x4c 0xd5
-# CHECK: tlbip   ipas2e1is, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x24 0x90 0x4c 0xd5
-# CHECK: tlbip   ipas2e1isnxs, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x24 0x84 0x4c 0xd5
-# CHECK: tlbip   ipas2e1, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x24 0x94 0x4c 0xd5
-# CHECK: tlbip   ipas2e1nxs, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x04 0x84 0x4c 0xd5
-# CHECK: tlbip   ipas2e1os, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x04 0x94 0x4c 0xd5
-# CHECK: tlbip   ipas2e1osnxs, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa4 0x84 0x4c 0xd5
-# CHECK: tlbip   ipas2le1, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa4 0x94 0x4c 0xd5
-# CHECK: tlbip   ipas2le1nxs, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa4 0x80 0x4c 0xd5
-# CHECK: tlbip   ipas2le1is, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa4 0x90 0x4c 0xd5
-# CHECK: tlbip   ipas2le1isnxs, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x84 0x84 0x4c 0xd5
-# CHECK: tlbip   ipas2le1os, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x84 0x94 0x4c 0xd5
-# CHECK: tlbip   ipas2le1osnxs, x4, x5
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-
-0x28 0x83 0x48 0xd5
-# CHECK: tlbip   vae1is, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x28 0x93 0x48 0xd5
-# CHECK: tlbip   vae1isnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x28 0x87 0x48 0xd5
-# CHECK: tlbip   vae1, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x28 0x97 0x48 0xd5
-# CHECK: tlbip   vae1nxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x28 0x83 0x48 0xd5
-# CHECK: tlbip   vae1is, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x28 0x93 0x48 0xd5
-# CHECK: tlbip   vae1isnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x28 0x81 0x48 0xd5
-# CHECK: tlbip   vae1os, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x28 0x91 0x48 0xd5
-# CHECK: tlbip   vae1osnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa8 0x87 0x48 0xd5
-# CHECK: tlbip   vale1, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa8 0x97 0x48 0xd5
-# CHECK: tlbip   vale1nxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa8 0x83 0x48 0xd5
-# CHECK: tlbip   vale1is, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa8 0x93 0x48 0xd5
-# CHECK: tlbip   vale1isnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa8 0x81 0x48 0xd5
-# CHECK: tlbip   vale1os, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xa8 0x91 0x48 0xd5
-# CHECK: tlbip   vale1osnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x68 0x87 0x48 0xd5
-# CHECK: tlbip   vaae1, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x68 0x97 0x48 0xd5
-# CHECK: tlbip   vaae1nxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x68 0x83 0x48 0xd5
-# CHECK: tlbip   vaae1is, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x68 0x93 0x48 0xd5
-# CHECK: tlbip   vaae1isnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x68 0x81 0x48 0xd5
-# CHECK: tlbip   vaae1os, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x68 0x91 0x48 0xd5
-# CHECK: tlbip   vaae1osnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xe8 0x87 0x48 0xd5
-# CHECK: tlbip   vaale1, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xe8 0x97 0x48 0xd5
-# CHECK: tlbip   vaale1nxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xe8 0x83 0x48 0xd5
-# CHECK: tlbip   vaale1is, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xe8 0x93 0x48 0xd5
-# CHECK: tlbip   vaale1isnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xe8 0x81 0x48 0xd5
-# CHECK: tlbip   vaale1os, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xe8 0x91 0x48 0xd5
-# CHECK: tlbip   vaale1osnxs, x8, x9
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-0x2e 0x87 0x4c 0xd5
-# CHECK: tlbip   vae2, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2e 0x97 0x4c 0xd5
-# CHECK: tlbip   vae2nxs, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2e 0x83 0x4c 0xd5
-# CHECK: tlbip   vae2is, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2e 0x93 0x4c 0xd5
-# CHECK: tlbip   vae2isnxs, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2e 0x81 0x4c 0xd5
-# CHECK: tlbip   vae2os, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2e 0x91 0x4c 0xd5
-# CHECK: tlbip   vae2osnxs, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xae 0x87 0x4c 0xd5
-# CHECK: tlbip   vale2, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xae 0x97 0x4c 0xd5
-# CHECK: tlbip   vale2nxs, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xae 0x83 0x4c 0xd5
-# CHECK: tlbip   vale2is, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xae 0x93 0x4c 0xd5
-# CHECK: tlbip   vale2isnxs, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xae 0x81 0x4c 0xd5
-# CHECK: tlbip   vale2os, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xae 0x91 0x4c 0xd5
-# CHECK: tlbip   vale2osnxs, x14, x15
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-0x38 0x87 0x4e 0xd5
-# CHECK: tlbip   vae3, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x38 0x97 0x4e 0xd5
-# CHECK: tlbip   vae3nxs, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x38 0x83 0x4e 0xd5
-# CHECK: tlbip   vae3is, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x38 0x93 0x4e 0xd5
-# CHECK: tlbip   vae3isnxs, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x38 0x81 0x4e 0xd5
-# CHECK: tlbip   vae3os, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x38 0x91 0x4e 0xd5
-# CHECK: tlbip   vae3osnxs, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb8 0x87 0x4e 0xd5
-# CHECK: tlbip   vale3, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb8 0x97 0x4e 0xd5
-# CHECK: tlbip   vale3nxs, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb8 0x83 0x4e 0xd5
-# CHECK: tlbip   vale3is, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb8 0x93 0x4e 0xd5
-# CHECK: tlbip   vale3isnxs, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb8 0x81 0x4e 0xd5
-# CHECK: tlbip   vale3os, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb8 0x91 0x4e 0xd5
-# CHECK: tlbip   vale3osnxs, x24, x25
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-
-0x32 0x86 0x48 0xd5
-# CHECK: tlbip   rvae1, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x32 0x96 0x48 0xd5
-# CHECK: tlbip   rvae1nxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x32 0x86 0x48 0xd5
-# CHECK: tlbip   rvae1, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x32 0x96 0x48 0xd5
-# CHECK: tlbip   rvae1nxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x32 0x82 0x48 0xd5
-# CHECK: tlbip   rvae1is, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x32 0x92 0x48 0xd5
-# CHECK: tlbip   rvae1isnxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x32 0x85 0x48 0xd5
-# CHECK: tlbip   rvae1os, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x32 0x95 0x48 0xd5
-# CHECK: tlbip   rvae1osnxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x72 0x86 0x48 0xd5
-# CHECK: tlbip   rvaae1, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x72 0x96 0x48 0xd5
-# CHECK: tlbip   rvaae1nxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x72 0x82 0x48 0xd5
-# CHECK: tlbip   rvaae1is, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x72 0x92 0x48 0xd5
-# CHECK: tlbip   rvaae1isnxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x72 0x85 0x48 0xd5
-# CHECK: tlbip   rvaae1os, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x72 0x95 0x48 0xd5
-# CHECK: tlbip   rvaae1osnxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb2 0x86 0x48 0xd5
-# CHECK: tlbip   rvale1, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb2 0x96 0x48 0xd5
-# CHECK: tlbip   rvale1nxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb2 0x82 0x48 0xd5
-# CHECK: tlbip   rvale1is, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb2 0x92 0x48 0xd5
-# CHECK: tlbip   rvale1isnxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb2 0x85 0x48 0xd5
-# CHECK: tlbip   rvale1os, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xb2 0x95 0x48 0xd5
-# CHECK: tlbip   rvale1osnxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xf2 0x86 0x48 0xd5
-# CHECK: tlbip   rvaale1, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xf2 0x96 0x48 0xd5
-# CHECK: tlbip   rvaale1nxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xf2 0x82 0x48 0xd5
-# CHECK: tlbip   rvaale1is, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xf2 0x92 0x48 0xd5
-# CHECK: tlbip   rvaale1isnxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xf2 0x85 0x48 0xd5
-# CHECK: tlbip   rvaale1os, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xf2 0x95 0x48 0xd5
-# CHECK: tlbip   rvaale1osnxs, x18, x19
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-0x3c 0x86 0x4c 0xd5
-# CHECK: tlbip   rvae2, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x3c 0x96 0x4c 0xd5
-# CHECK: tlbip   rvae2nxs, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x3c 0x82 0x4c 0xd5
-# CHECK: tlbip   rvae2is, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x3c 0x92 0x4c 0xd5
-# CHECK: tlbip   rvae2isnxs, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x3c 0x85 0x4c 0xd5
-# CHECK: tlbip   rvae2os, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x3c 0x95 0x4c 0xd5
-# CHECK: tlbip   rvae2osnxs, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xbc 0x86 0x4c 0xd5
-# CHECK: tlbip   rvale2, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xbc 0x96 0x4c 0xd5
-# CHECK: tlbip   rvale2nxs, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xbc 0x82 0x4c 0xd5
-# CHECK: tlbip   rvale2is, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xbc 0x92 0x4c 0xd5
-# CHECK: tlbip   rvale2isnxs, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xbc 0x85 0x4c 0xd5
-# CHECK: tlbip   rvale2os, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xbc 0x95 0x4c 0xd5
-# CHECK: tlbip   rvale2osnxs, x28, x29
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-0x2a 0x86 0x4e 0xd5
-# CHECK: tlbip   rvae3, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2a 0x96 0x4e 0xd5
-# CHECK: tlbip   rvae3nxs, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2a 0x82 0x4e 0xd5
-# CHECK: tlbip   rvae3is, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2a 0x92 0x4e 0xd5
-# CHECK: tlbip   rvae3isnxs, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2a 0x85 0x4e 0xd5
-# CHECK: tlbip   rvae3os, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x2a 0x95 0x4e 0xd5
-# CHECK: tlbip   rvae3osnxs, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xaa 0x86 0x4e 0xd5
-# CHECK: tlbip   rvale3, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xaa 0x96 0x4e 0xd5
-# CHECK: tlbip   rvale3nxs, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xaa 0x82 0x4e 0xd5
-# CHECK: tlbip   rvale3is, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xaa 0x92 0x4e 0xd5
-# CHECK: tlbip   rvale3isnxs, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xaa 0x85 0x4e 0xd5
-# CHECK: tlbip   rvale3os, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xaa 0x95 0x4e 0xd5
-# CHECK: tlbip   rvale3osnxs, x10, x11
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-
-0x54 0x80 0x4c 0xd5
-# CHECK: tlbip   ripas2e1is, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x54 0x90 0x4c 0xd5
-# CHECK: tlbip   ripas2e1isnxs, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x54 0x84 0x4c 0xd5
-# CHECK: tlbip   ripas2e1, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x54 0x94 0x4c 0xd5
-# CHECK: tlbip   ripas2e1nxs, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x54 0x80 0x4c 0xd5
-# CHECK: tlbip   ripas2e1is, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x54 0x90 0x4c 0xd5
-# CHECK: tlbip   ripas2e1isnxs, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x74 0x84 0x4c 0xd5
-# CHECK: tlbip   ripas2e1os, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0x74 0x94 0x4c 0xd5
-# CHECK: tlbip   ripas2e1osnxs, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xd4 0x84 0x4c 0xd5
-# CHECK: tlbip   ripas2le1, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xd4 0x94 0x4c 0xd5
-# CHECK: tlbip   ripas2le1nxs, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xd4 0x80 0x4c 0xd5
-# CHECK: tlbip   ripas2le1is, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xd4 0x90 0x4c 0xd5
-# CHECK: tlbip   ripas2le1isnxs, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xf4 0x84 0x4c 0xd5
-# CHECK: tlbip   ripas2le1os, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-0xf4 0x94 0x4c 0xd5
-# CHECK: tlbip   ripas2le1osnxs, x20, x21
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK: tlbip ripas2le1os, xzr, xzr
-0xff 0x84 0x4c 0xd5
-# ERROR-NO-D128: warning: invalid instruction encoding
-# CHECK: tlbip ripas2le1osnxs, xzr, xzr
-0xff 0x94 0x4c 0xd5
-# ERROR-NO-D128: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9-sysreg128.txt b/llvm/test/MC/Disassembler/AArch64/armv9-sysreg128.txt
deleted file mode 100644
index 4ab37a0..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9-sysreg128.txt
+++ /dev/null
@@ -1,147 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -mattr=+d128                   --disassemble -show-encoding %s -o - | FileCheck %s --check-prefix=WITHOUT
-# RUN: llvm-mc -triple aarch64 -mattr=+d128,+the,+el2vmsa,+vh --disassemble -show-encoding %s -o - | FileCheck %s --check-prefix=W_FEATS
-
-# RUN: llvm-mc -triple aarch64 --disassemble -show-encoding %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR-NO-D128
-
-0x00,0x20,0x78,0xd5
-0x20,0x20,0x78,0xd5
-0x00,0x74,0x78,0xd5
-0x60,0xd0,0x78,0xd5
-0xc0,0xd0,0x78,0xd5
-0x00,0x20,0x7c,0xd5
-0x20,0x20,0x7c,0xd5
-0x00,0x21,0x7c,0xd5
-0x00,0x21,0x7c,0xd5
-0x02,0x21,0x7c,0xd5
-0x04,0x21,0x7c,0xd5
-0x06,0x21,0x7c,0xd5
-0x08,0x21,0x7c,0xd5
-0x0a,0x21,0x7c,0xd5
-0x0c,0x21,0x7c,0xd5
-0x0e,0x21,0x7c,0xd5
-0x10,0x21,0x7c,0xd5
-0x12,0x21,0x7c,0xd5
-0x14,0x21,0x7c,0xd5
-0x16,0x21,0x7c,0xd5
-0x18,0x21,0x7c,0xd5
-0x1a,0x21,0x7c,0xd5
-
-# WITHOUT: mrrs	x0, x1, TTBR0_EL1               // encoding: [0x00,0x20,0x78,0xd5]
-# WITHOUT: mrrs	x0, x1, TTBR1_EL1               // encoding: [0x20,0x20,0x78,0xd5]
-# WITHOUT: mrrs	x0, x1, PAR_EL1                 // encoding: [0x00,0x74,0x78,0xd5]
-# WITHOUT: mrrs	x0, x1, S3_0_C13_C0_3           // encoding: [0x60,0xd0,0x78,0xd5]
-# WITHOUT: mrrs	x0, x1, S3_0_C13_C0_6           // encoding: [0xc0,0xd0,0x78,0xd5]
-# WITHOUT: mrrs	x0, x1, S3_4_C2_C0_0            // encoding: [0x00,0x20,0x7c,0xd5]
-# WITHOUT: mrrs	x0, x1, S3_4_C2_C0_1            // encoding: [0x20,0x20,0x7c,0xd5]
-# WITHOUT: mrrs	x0, x1, S3_4_C2_C1_0            // encoding: [0x00,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x0, x1, S3_4_C2_C1_0            // encoding: [0x00,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x2, x3, S3_4_C2_C1_0            // encoding: [0x02,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x4, x5, S3_4_C2_C1_0            // encoding: [0x04,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x6, x7, S3_4_C2_C1_0            // encoding: [0x06,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x8, x9, S3_4_C2_C1_0            // encoding: [0x08,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x10, x11, S3_4_C2_C1_0          // encoding: [0x0a,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x12, x13, S3_4_C2_C1_0          // encoding: [0x0c,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x14, x15, S3_4_C2_C1_0          // encoding: [0x0e,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x16, x17, S3_4_C2_C1_0          // encoding: [0x10,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x18, x19, S3_4_C2_C1_0          // encoding: [0x12,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x20, x21, S3_4_C2_C1_0          // encoding: [0x14,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x22, x23, S3_4_C2_C1_0          // encoding: [0x16,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x24, x25, S3_4_C2_C1_0          // encoding: [0x18,0x21,0x7c,0xd5]
-# WITHOUT: mrrs	x26, x27, S3_4_C2_C1_0          // encoding: [0x1a,0x21,0x7c,0xd5]
-
-# W_FEATS: mrrs	x0, x1, TTBR0_EL1               // encoding: [0x00,0x20,0x78,0xd5]
-# W_FEATS: mrrs	x0, x1, TTBR1_EL1               // encoding: [0x20,0x20,0x78,0xd5]
-# W_FEATS: mrrs	x0, x1, PAR_EL1                 // encoding: [0x00,0x74,0x78,0xd5]
-# W_FEATS: mrrs	x0, x1, RCWSMASK_EL1            // encoding: [0x60,0xd0,0x78,0xd5]
-# W_FEATS: mrrs	x0, x1, RCWMASK_EL1             // encoding: [0xc0,0xd0,0x78,0xd5]
-# W_FEATS: mrrs	x0, x1, TTBR0_EL2               // encoding: [0x00,0x20,0x7c,0xd5]
-# W_FEATS: mrrs	x0, x1, TTBR1_EL2               // encoding: [0x20,0x20,0x7c,0xd5]
-# W_FEATS: mrrs	x0, x1, VTTBR_EL2               // encoding: [0x00,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x0, x1, VTTBR_EL2               // encoding: [0x00,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x2, x3, VTTBR_EL2               // encoding: [0x02,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x4, x5, VTTBR_EL2               // encoding: [0x04,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x6, x7, VTTBR_EL2               // encoding: [0x06,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x8, x9, VTTBR_EL2               // encoding: [0x08,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x10, x11, VTTBR_EL2             // encoding: [0x0a,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x12, x13, VTTBR_EL2             // encoding: [0x0c,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x14, x15, VTTBR_EL2             // encoding: [0x0e,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x16, x17, VTTBR_EL2             // encoding: [0x10,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x18, x19, VTTBR_EL2             // encoding: [0x12,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x20, x21, VTTBR_EL2             // encoding: [0x14,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x22, x23, VTTBR_EL2             // encoding: [0x16,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x24, x25, VTTBR_EL2             // encoding: [0x18,0x21,0x7c,0xd5]
-# W_FEATS: mrrs	x26, x27, VTTBR_EL2             // encoding: [0x1a,0x21,0x7c,0xd5]
-
-# ERROR-NO-D128: warning: invalid instruction encoding
-
-
-0x00,0x20,0x58,0xd5
-0x20,0x20,0x58,0xd5
-0x00,0x74,0x58,0xd5
-0x60,0xd0,0x58,0xd5
-0xc0,0xd0,0x58,0xd5
-0x00,0x20,0x5c,0xd5
-0x20,0x20,0x5c,0xd5
-0x00,0x21,0x5c,0xd5
-0x00,0x21,0x5c,0xd5
-0x02,0x21,0x5c,0xd5
-0x04,0x21,0x5c,0xd5
-0x06,0x21,0x5c,0xd5
-0x08,0x21,0x5c,0xd5
-0x0a,0x21,0x5c,0xd5
-0x0c,0x21,0x5c,0xd5
-0x0e,0x21,0x5c,0xd5
-0x10,0x21,0x5c,0xd5
-0x12,0x21,0x5c,0xd5
-0x14,0x21,0x5c,0xd5
-0x16,0x21,0x5c,0xd5
-0x18,0x21,0x5c,0xd5
-0x1a,0x21,0x5c,0xd5
-
-# WITHOUT: msrr	TTBR0_EL1, x0, x1               // encoding: [0x00,0x20,0x58,0xd5]
-# WITHOUT: msrr	TTBR1_EL1, x0, x1               // encoding: [0x20,0x20,0x58,0xd5]
-# WITHOUT: msrr	PAR_EL1, x0, x1                 // encoding: [0x00,0x74,0x58,0xd5]
-# WITHOUT: msrr	S3_0_C13_C0_3, x0, x1           // encoding: [0x60,0xd0,0x58,0xd5]
-# WITHOUT: msrr	S3_0_C13_C0_6, x0, x1           // encoding: [0xc0,0xd0,0x58,0xd5]
-# WITHOUT: msrr	S3_4_C2_C0_0, x0, x1            // encoding: [0x00,0x20,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C0_1, x0, x1            // encoding: [0x20,0x20,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x0, x1            // encoding: [0x00,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x0, x1            // encoding: [0x00,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x2, x3            // encoding: [0x02,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x4, x5            // encoding: [0x04,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x6, x7            // encoding: [0x06,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x8, x9            // encoding: [0x08,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x10, x11          // encoding: [0x0a,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x12, x13          // encoding: [0x0c,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x14, x15          // encoding: [0x0e,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x16, x17          // encoding: [0x10,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x18, x19          // encoding: [0x12,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x20, x21          // encoding: [0x14,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x22, x23          // encoding: [0x16,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x24, x25          // encoding: [0x18,0x21,0x5c,0xd5]
-# WITHOUT: msrr	S3_4_C2_C1_0, x26, x27          // encoding: [0x1a,0x21,0x5c,0xd5]
-
-# W_FEATS: msrr	TTBR0_EL1, x0, x1               // encoding: [0x00,0x20,0x58,0xd5]
-# W_FEATS: msrr	TTBR1_EL1, x0, x1               // encoding: [0x20,0x20,0x58,0xd5]
-# W_FEATS: msrr	PAR_EL1, x0, x1                 // encoding: [0x00,0x74,0x58,0xd5]
-# W_FEATS: msrr	RCWSMASK_EL1, x0, x1            // encoding: [0x60,0xd0,0x58,0xd5]
-# W_FEATS: msrr	RCWMASK_EL1, x0, x1             // encoding: [0xc0,0xd0,0x58,0xd5]
-# W_FEATS: msrr	TTBR0_EL2, x0, x1               // encoding: [0x00,0x20,0x5c,0xd5]
-# W_FEATS: msrr	TTBR1_EL2, x0, x1               // encoding: [0x20,0x20,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x0, x1               // encoding: [0x00,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x0, x1               // encoding: [0x00,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x2, x3               // encoding: [0x02,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x4, x5               // encoding: [0x04,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x6, x7               // encoding: [0x06,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x8, x9               // encoding: [0x08,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x10, x11             // encoding: [0x0a,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x12, x13             // encoding: [0x0c,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x14, x15             // encoding: [0x0e,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x16, x17             // encoding: [0x10,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x18, x19             // encoding: [0x12,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x20, x21             // encoding: [0x14,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x22, x23             // encoding: [0x16,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x24, x25             // encoding: [0x18,0x21,0x5c,0xd5]
-# W_FEATS: msrr	VTTBR_EL2, x26, x27             // encoding: [0x1a,0x21,0x5c,0xd5]
-
-# ERROR-NO-D128: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.4a-chk.txt b/llvm/test/MC/Disassembler/AArch64/armv9.4a-chk.txt
deleted file mode 100644
index 730f444..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.4a-chk.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-# RUN: llvm-mc -triple=aarch64 -mattr=+v8.9a -disassemble %s 2> %t | FileCheck %s
-# RUN: llvm-mc -triple=aarch64 -mattr=+v9.4a -disassemble %s 2> %t | FileCheck %s
-# RUN: llvm-mc -triple=aarch64 -mattr=+chk   -disassemble %s 2> %t | FileCheck %s
-# RUN: llvm-mc -triple=aarch64 -mattr=+v8a   -disassemble %s 2> %t | FileCheck %s --check-prefix=NO-CHK
-
-[0x1f,0x25,0x03,0xd5]
-// CHECK: chkfeat x16
-// NO-CHK: hint #40
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.4a-ebep.txt b/llvm/test/MC/Disassembler/AArch64/armv9.4a-ebep.txt
deleted file mode 100644
index aa9c95f..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.4a-ebep.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-# RUN: llvm-mc -triple=aarch64 -disassemble %s | FileCheck %s
-
-[0x23,0x43,0x38,0xd5]
-# CHECK:  mrs x3, PM
-
-[0x26,0x43,0x18,0xd5]
-# CHECK:  msr PM, x6
-
-[0x1f,0x42,0x01,0xd5]
-# CHECK:  msr PM, #0
-
-[0x1f,0x43,0x01,0xd5]
-# CHECK:  msr PM, #1
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.4a-gcs.txt b/llvm/test/MC/Disassembler/AArch64/armv9.4a-gcs.txt
deleted file mode 100644
index 512f402..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.4a-gcs.txt
+++ /dev/null
@@ -1,90 +0,0 @@
-# RUN: llvm-mc -triple=aarch64 -mattr +gcs -disassemble %s 2> %t | FileCheck %s
-
-[0x00,0x25,0x18,0xd5]
-[0x01,0x25,0x38,0xd5]
-// CHECK: msr     GCSCR_EL1, x0
-// CHECK: mrs     x1, GCSCR_EL1
-
-[0x22,0x25,0x18,0xd5]
-[0x23,0x25,0x38,0xd5]
-// CHECK: msr     GCSPR_EL1, x2
-// CHECK: mrs     x3, GCSPR_EL1
-
-[0x44,0x25,0x18,0xd5]
-[0x45,0x25,0x38,0xd5]
-// CHECK: msr     GCSCRE0_EL1, x4
-// CHECK: mrs     x5, GCSCRE0_EL1
-
-[0x26,0x25,0x1b,0xd5]
-[0x27,0x25,0x3b,0xd5]
-// CHECK: msr     GCSPR_EL0, x6
-// CHECK: mrs     x7, GCSPR_EL0
-
-[0x0a,0x25,0x1c,0xd5]
-[0x0b,0x25,0x3c,0xd5]
-// CHECK: msr     GCSCR_EL2, x10
-// CHECK: mrs     x11, GCSCR_EL2
-
-[0x2c,0x25,0x1c,0xd5]
-[0x2d,0x25,0x3c,0xd5]
-// CHECK: msr     GCSPR_EL2, x12
-// CHECK: mrs     x13, GCSPR_EL2
-
-[0x0e,0x25,0x1d,0xd5]
-[0x0f,0x25,0x3d,0xd5]
-// CHECK: msr     GCSCR_EL12, x14
-// CHECK: mrs     x15, GCSCR_EL12
-
-[0x30,0x25,0x1d,0xd5]
-[0x31,0x25,0x3d,0xd5]
-// CHECK: msr     GCSPR_EL12, x16
-// CHECK: mrs     x17, GCSPR_EL12
-
-[0x12,0x25,0x1e,0xd5]
-[0x13,0x25,0x3e,0xd5]
-// CHECK: msr     GCSCR_EL3, x18
-// CHECK: mrs     x19, GCSCR_EL3
-
-[0x34,0x25,0x1e,0xd5]
-[0x35,0x25,0x3e,0xd5]
-// CHECK: msr     GCSPR_EL3, x20
-// CHECK: mrs     x21, GCSPR_EL3
-
-[0x55,0x77,0x0b,0xd5]
-// CHECK: gcsss1 x21
-
-[0x76,0x77,0x2b,0xd5]
-// CHECK: gcsss2    x22
-
-[0x19,0x77,0x0b,0xd5]
-// CHECK: gcspushm x25
-
-[0x3f,0x77,0x2b,0xd5]
-// CHECK: gcspopm
-
-[0x39,0x77,0x2b,0xd5]
-// CHECK: gcspopm    x25
-
-[0x7f,0x22,0x03,0xd5]
-// CHECK: gcsb    dsync
-
-[0x7a,0x0f,0x1f,0xd9]
-// CHECK: gcsstr   x26, [x27]
-
-[0xfa,0x0f,0x1f,0xd9]
-// CHECK: gcsstr   x26, [sp]
-
-[0x7a,0x1f,0x1f,0xd9]
-// CHECK: gcssttr  x26, [x27]
-
-[0xfa,0x1f,0x1f,0xd9]
-// CHECK: gcssttr  x26, [sp]
-
-[0x9f,0x77,0x08,0xd5]
-// CHECK: gcspushx
-
-[0xbf,0x77,0x08,0xd5]
-// CHECK: gcspopcx
-
-[0xdf,0x77,0x08,0xd5]
-// CHECK: gcspopx
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.4a-lse128.txt b/llvm/test/MC/Disassembler/AArch64/armv9.4a-lse128.txt
deleted file mode 100644
index d4dffa0..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.4a-lse128.txt
+++ /dev/null
@@ -1,98 +0,0 @@
-# RUN: llvm-mc -triple=aarch64 -mattr=+lse128 -disassemble %s | FileCheck %s
-# RUN: not llvm-mc -triple=aarch64 -disassemble %s 2>&1 | FileCheck --check-prefix=NO-LSE128 %s
-
-[0x61,0x11,0x22,0x19]
-# CHECK: ldclrp x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x13,0x36,0x19]
-# CHECK: ldclrp x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x11,0xa2,0x19]
-# CHECK: ldclrpa x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x13,0xb6,0x19]
-# CHECK: ldclrpa x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x11,0xe2,0x19]
-# CHECK: ldclrpal x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x13,0xf6,0x19]
-# CHECK: ldclrpal x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x11,0x62,0x19]
-# CHECK: ldclrpl x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x13,0x76,0x19]
-# CHECK: ldclrpl x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x31,0x22,0x19]
-# CHECK: ldsetp x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x33,0x36,0x19]
-# CHECK: ldsetp x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x31,0xa2,0x19]
-# CHECK: ldsetpa x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x33,0xb6,0x19]
-# CHECK: ldsetpa x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x31,0xe2,0x19]
-# CHECK: ldsetpal x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x33,0xf6,0x19]
-# CHECK: ldsetpal x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x31,0x62,0x19]
-# CHECK: ldsetpl x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x33,0x76,0x19]
-# CHECK: ldsetpl x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x81,0x22,0x19]
-# CHECK: swpp x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x83,0x36,0x19]
-# CHECK: swpp x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x81,0xa2,0x19]
-# CHECK: swppa x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x83,0xb6,0x19]
-# CHECK: swppa x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x81,0xe2,0x19]
-# CHECK: swppal x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x83,0xf6,0x19]
-# CHECK: swppal x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0x61,0x81,0x62,0x19]
-# CHECK: swppl x1, x2, [x11]
-# NO-LSE128: warning: invalid instruction encoding
-
-[0xf5,0x83,0x76,0x19]
-# CHECK: swppl x21, x22, [sp]
-# NO-LSE128: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-cpa.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-cpa.txt
deleted file mode 100644
index bf61782..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-cpa.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+cpa < %s | FileCheck %s
-# RUN: not llvm-mc -triple aarch64 -disassemble < %s 2>&1 | FileCheck --check-prefix=NO-CPA %s
-
-[0x20,0x20,0x02,0x9a]
-# CHECK: addpt x0, x1, x2
-# NO-CPA: warning: invalid instruction encoding
-
-[0xff,0x23,0x02,0x9a]
-# CHECK: addpt sp, sp, x2
-# NO-CPA: warning: invalid instruction encoding
-
-[0x20,0x3c,0x02,0x9a]
-# CHECK: addpt x0, x1, x2, lsl #7
-# NO-CPA: warning: invalid instruction encoding
-
-[0xff,0x3f,0x02,0x9a]
-# CHECK: addpt sp, sp, x2, lsl #7
-# NO-CPA: warning: invalid instruction encoding
-
-[0x20,0x20,0x02,0xda]
-# CHECK: subpt x0, x1, x2
-# NO-CPA: warning: invalid instruction encoding
-
-[0xff,0x23,0x02,0xda]
-# CHECK: subpt sp, sp, x2
-# NO-CPA: warning: invalid instruction encoding
-
-[0x20,0x3c,0x02,0xda]
-# CHECK: subpt x0, x1, x2, lsl #7
-# NO-CPA: warning: invalid instruction encoding
-
-[0xff,0x3f,0x02,0xda]
-# CHECK: subpt sp, sp, x2, lsl #7
-# NO-CPA: warning: invalid instruction encoding
-
-[0x20,0x0c,0x62,0x9b]
-# CHECK: maddpt x0, x1, x2, x3
-# NO-CPA: warning: invalid instruction encoding
-
-[0x20,0x8c,0x62,0x9b]
-# CHECK: msubpt x0, x1, x2, x3
-# NO-CPA: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt
deleted file mode 100644
index d2476db..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-[0x20,0xc1,0x3e,0xd5]
-# CHECK: mrs x0, VDISR_EL3
-
-[0x20,0xc1,0x1e,0xd5]
-# CHECK: msr VDISR_EL3, x0
-
-[0x60,0x52,0x3e,0xd5]
-# CHECK: mrs x0, VSESR_EL3
-
-[0x60,0x52,0x1e,0xd5]
-# CHECK: msr VSESR_EL3, x0
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt
deleted file mode 100644
index f7e355a..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-[0xa0,0x11,0x3e,0xd5]
-# CHECK: mrs x0, FGWTE3_EL3
-
-[0xa0,0x11,0x1e,0xd5]
-# CHECK: msr FGWTE3_EL3, x0
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt
deleted file mode 100644
index d9be7e5..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-[0x80,0x23,0x3c,0xd5]
-# CHECK: mrs x0, HACDBSBR_EL2
-
-[0x80,0x23,0x1c,0xd5]
-# CHECK: msr HACDBSBR_EL2, x0
-
-[0xa0,0x23,0x3c,0xd5]
-# CHECK: mrs x0, HACDBSCONS_EL2
-
-[0xa0,0x23,0x1c,0xd5]
-# CHECK: msr HACDBSCONS_EL2, x0
-
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt
deleted file mode 100644
index 999f3225..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-[0x40,0x23,0x3c,0xd5]
-# CHECK: mrs x0, HDBSSBR_EL2
-
-[0x40,0x23,0x1c,0xd5]
-# CHECK: msr HDBSSBR_EL2, x0
-
-[0x60,0x23,0x3c,0xd5]
-# CHECK: mrs x0, HDBSSPROD_EL2
-
-[0x60,0x23,0x1c,0xd5]
-# CHECK: msr HDBSSPROD_EL2, x0
-
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-spmu2.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-spmu2.txt
deleted file mode 100644
index 9d4fa1b..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-spmu2.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-[0x80,0x9c,0x13,0xd5]
-# CHECK: msr SPMZR_EL0, x0
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-step2.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-step2.txt
deleted file mode 100644
index 473c16d..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-step2.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-[0x40,0x05,0x30,0xd5]
-# CHECK: mrs x0, MDSTEPOP_EL1
-
-[0x40,0x05,0x10,0xd5]
-# CHECK: msr MDSTEPOP_EL1, x0
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt
deleted file mode 100644
index df5e894..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+tlbiw -mattr=+xs < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-XS %s
-# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+tlbiw < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-NO-XS-TLBIW %s
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck --check-prefix=CHECK-NO-TLBIW --check-prefix=CHECK-NO-XS-TLBIW %s
-
-[0x5f,0x86,0x0c,0xd5]
-# CHECK-TLBIW: tlbi vmallws2e1
-# CHECK-NO-TLBIW: sys #4, c8, c6, #2
-
-[0x5f,0x82,0x0c,0xd5]
-# CHECK-TLBIW: tlbi vmallws2e1is
-# CHECK-NO-TLBIW: sys #4, c8, c2, #2
-
-[0x5f,0x85,0x0c,0xd5]
-# CHECK-TLBIW: tlbi vmallws2e1os
-# CHECK-NO-TLBIW: sys #4, c8, c5, #2
-
-[0x5f,0x96,0x0c,0xd5]
-# CHECK-XS: tlbi vmallws2e1nxs
-# CHECK-NO-XS-TLBIW: sys #4, c9, c6, #2
-
-[0x5f,0x92,0x0c,0xd5]
-# CHECK-XS: tlbi vmallws2e1isnxs
-# CHECK-NO-XS-TLBIW: sys #4, c9, c2, #2
-
-[0x5f,0x95,0x0c,0xd5]
-# CHECK-XS: tlbi vmallws2e1osnxs
-# CHECK-NO-XS-TLBIW: sys #4, c9, c5, #2
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-lsui.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-lsui.txt
deleted file mode 100644
index dc53a0b..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-lsui.txt
+++ /dev/null
@@ -1,323 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mc -triple aarch64 -mattr=+lsui -disassemble %s  | FileCheck %s
-
-# LDTXR and STTXR
-[0xe9,0x7f,0x5f,0xc9]
-[0xe9,0x7f,0x5f,0xc9]
-[0x6a,0x7d,0x5f,0xc9]
-[0x6a,0x7d,0x5f,0xc9]
-
-[0xe4,0x7f,0x1f,0x89]
-[0xe4,0x7f,0x1f,0x89]
-[0xe6,0x7c,0x05,0xc9]
-[0xe6,0x7c,0x05,0xc9]
-
-# LDATXR and STLTXR
-[0xe9,0xff,0x5f,0xc9]
-[0x6a,0xfd,0x5f,0xc9]
-
-[0xe4,0xff,0x02,0x89]
-[0xe6,0xfc,0x05,0xc9]
-
-# STTP and LDTP
-[0x55,0xf4,0x5f,0xe9]
-[0x76,0x5c,0x60,0xe9]
-[0x98,0xe4,0x40,0xe9]
-
-[0xe3,0x17,0x81,0xe8]
-[0xe3,0x97,0x80,0xe9]
-[0xe3,0x17,0x00,0xed]
-[0xf1,0xcf,0x1f,0xed]
-
-[0x55,0xf4,0xdf,0xe8]
-[0x76,0x5c,0xe0,0xe8]
-[0x98,0xe4,0xc0,0xe8]
-
-[0xe3,0x17,0x80,0xec]
-[0xf1,0xcf,0x9f,0xec]
-[0x37,0x74,0xe0,0xec]
-
-[0x55,0xf4,0xdf,0xe9]
-[0x76,0x5c,0xe0,0xe9]
-[0x98,0xe4,0xc0,0xe9]
-
-[0xe3,0x17,0x80,0xed]
-[0xf1,0xcf,0x9f,0xed]
-[0x37,0x74,0xe0,0xed]
-
-[0x55,0xf4,0x5f,0xe8]
-[0x76,0x5c,0x60,0xe8]
-[0x98,0xe4,0x40,0xe8]
-[0x37,0x74,0x60,0xec]
-
-[0xe3,0x17,0x00,0xe8]
-[0xf1,0x4f,0x04,0xe8]
-[0xe3,0x17,0x00,0xec]
-[0xf1,0xcf,0x1f,0xec]
-
-# SWPT{A|L}
-[0xbf,0x84,0x27,0x19]
-[0xff,0x87,0x29,0x59]
-
-[0xbf,0x84,0xa7,0x19]
-[0xff,0x87,0xa9,0x59]
-
-[0xbf,0x84,0x67,0x19]
-[0xff,0x87,0x69,0x59]
-
-[0xbf,0x84,0xe7,0x19]
-[0xff,0x87,0xe9,0x59]
-
-# CAS{A|L}T
-[0x41,0x7c,0x80,0xc9]
-[0xe1,0x7f,0x80,0xc9]
-[0x41,0x7c,0xc0,0xc9]
-[0xe1,0x7f,0xc0,0xc9]
-[0x41,0xfc,0xc0,0xc9]
-[0xe1,0xff,0xc0,0xc9]
-[0x41,0xfc,0x80,0xc9]
-[0xe1,0xff,0x80,0xc9]
-
-# CASP{A|L}T
-[0x82,0x7c,0x80,0x49]
-[0xe2,0x7f,0x80,0x49]
-[0x82,0x7c,0xc0,0x49]
-[0xe2,0x7f,0xc0,0x49]
-[0x82,0xfc,0x80,0x49]
-[0xe2,0xff,0x80,0x49]
-[0x82,0xfc,0xc0,0x49]
-[0xe2,0xff,0xc0,0x49]
-
-#LDT{SET|ADD|CLR}{A|L} and STT{ADD|SET|CLR}{L}
-
-[0xbf,0x04,0x27,0x19]
-[0xff,0x07,0x29,0x59]
-
-[0xbf,0x04,0xa7,0x19]
-[0xff,0x07,0xa9,0x59]
-
-[0xbf,0x04,0x67,0x19]
-[0xff,0x07,0x69,0x59]
-
-[0xbf,0x04,0xe7,0x19]
-[0xff,0x07,0xe9,0x59]
-
-[0xbf,0x14,0x27,0x19]
-[0xff,0x17,0x29,0x59]
-
-[0xbf,0x14,0x67,0x19]
-[0xff,0x17,0x69,0x59]
-
-[0xbf,0x14,0xa7,0x19]
-[0xff,0x17,0xa9,0x59]
-
-[0xbf,0x14,0xe7,0x19]
-[0xff,0x17,0xe9,0x59]
-
-[0xbf,0x34,0x27,0x19]
-[0xff,0x37,0x29,0x59]
-
-[0xbf,0x34,0x67,0x19]
-[0xff,0x37,0x69,0x59]
-
-[0xbf,0x34,0xa7,0x19]
-[0xff,0x37,0xa9,0x59]
-
-[0xbf,0x34,0xe7,0x19]
-[0xff,0x37,0xe9,0x59]
-
-[0x5f,0x04,0x20,0x19]
-[0xff,0x07,0x22,0x19]
-[0x5f,0x04,0x20,0x59]
-[0xff,0x07,0x22,0x59]
-
-[0x5f,0x04,0x20,0x19]
-[0xff,0x07,0x22,0x19]
-[0x5f,0x04,0x20,0x59]
-[0xff,0x07,0x22,0x59]
-
-[0x5f,0x04,0x20,0x19]
-[0xff,0x07,0x22,0x19]
-[0x5f,0x04,0x20,0x59]
-[0xff,0x07,0x22,0x59]
-
-[0x5f,0x04,0x20,0x19]
-[0xff,0x07,0x22,0x19]
-[0x5f,0x04,0x20,0x59]
-[0xff,0x07,0x22,0x59]
-
-[0x5f,0x14,0x20,0x19]
-[0xff,0x17,0x22,0x19]
-[0x5f,0x14,0x20,0x59]
-[0xff,0x17,0x22,0x59]
-
-[0x5f,0x14,0x20,0x19]
-[0xff,0x17,0x22,0x19]
-[0x5f,0x14,0x20,0x59]
-[0xff,0x17,0x22,0x59]
-
-[0x5f,0x14,0x20,0x19]
-[0xff,0x17,0x22,0x19]
-[0x5f,0x14,0x20,0x59]
-[0xff,0x17,0x22,0x59]
-
-[0x5f,0x14,0x20,0x19]
-[0xff,0x17,0x22,0x59]
-[0x5f,0x14,0x20,0x59]
-[0xff,0x17,0x22,0x59]
-
-[0x5f,0x34,0x20,0x19]
-[0xff,0x37,0x22,0x19]
-[0x5f,0x34,0x20,0x59]
-[0xff,0x37,0x22,0x59]
-
-[0x5f,0x34,0x20,0x19]
-[0xff,0x37,0x22,0x19]
-[0x5f,0x34,0x20,0x59]
-[0xff,0x37,0x22,0x59]
-
-[0x5f,0x34,0x20,0x19]
-[0xff,0x37,0x22,0x19]
-[0x5f,0x34,0x20,0x59]
-[0xff,0x37,0x22,0x59]
-
-[0x5f,0x34,0x20,0x19]
-[0xff,0x37,0x22,0x59]
-[0x5f,0x34,0x20,0x59]
-[0xff,0x37,0x22,0x59]
-
-# CHECK:      	ldtxr	x9, [sp]
-# CHECK-NEXT: 	ldtxr	x9, [sp]
-# CHECK-NEXT: 	ldtxr	x10, [x11]
-# CHECK-NEXT: 	ldtxr	x10, [x11]
-# CHECK-NEXT: 	sttxr	wzr, w4, [sp]
-# CHECK-NEXT: 	sttxr	wzr, w4, [sp]
-# CHECK-NEXT: 	sttxr	w5, x6, [x7]
-# CHECK-NEXT: 	sttxr	w5, x6, [x7]
-# CHECK-NEXT: 	ldatxr	x9, [sp]
-# CHECK-NEXT: 	ldatxr	x10, [x11]
-# CHECK-NEXT: 	stltxr	w2, w4, [sp]
-# CHECK-NEXT: 	stltxr	w5, x6, [x7]
-# CHECK-NEXT: 	ldtp	x21, x29, [x2, #504]
-# CHECK-NEXT: 	ldtp	x22, x23, [x3, #-512]
-# CHECK-NEXT: 	ldtp	x24, x25, [x4, #8]
-# CHECK-NEXT:	sttp	x3, x5, [sp], #16
-# CHECK-NEXT:	sttp	x3, x5, [sp, #8]!
-# CHECK-NEXT: 	sttp	q3, q5, [sp]
-# CHECK-NEXT: 	sttp	q17, q19, [sp, #1008]
-# CHECK-NEXT: 	ldtp	x21, x29, [x2], #504
-# CHECK-NEXT: 	ldtp	x22, x23, [x3], #-512
-# CHECK-NEXT: 	ldtp	x24, x25, [x4], #8
-# CHECK-NEXT: 	sttp	q3, q5, [sp], #0
-# CHECK-NEXT: 	sttp	q17, q19, [sp], #1008
-# CHECK-NEXT: 	ldtp	q23, q29, [x1], #-1024
-# CHECK-NEXT: 	ldtp	x21, x29, [x2, #504]!
-# CHECK-NEXT: 	ldtp	x22, x23, [x3, #-512]!
-# CHECK-NEXT: 	ldtp	x24, x25, [x4, #8]!
-# CHECK-NEXT: 	sttp	q3, q5, [sp, #0]!
-# CHECK-NEXT: 	sttp	q17, q19, [sp, #1008]!
-# CHECK-NEXT: 	ldtp	q23, q29, [x1, #-1024]!
-# CHECK-NEXT: 	ldtnp	x21, x29, [x2, #504]
-# CHECK-NEXT: 	ldtnp	x22, x23, [x3, #-512]
-# CHECK-NEXT: 	ldtnp	x24, x25, [x4, #8]
-# CHECK-NEXT: 	ldtnp	q23, q29, [x1, #-1024]
-# CHECK-NEXT: 	sttnp	x3, x5, [sp]
-# CHECK-NEXT: 	sttnp	x17, x19, [sp, #64]
-# CHECK-NEXT: 	sttnp	q3, q5, [sp]
-# CHECK-NEXT: 	sttnp	q17, q19, [sp, #1008]
-# CHECK-NEXT: 	swpt	w7, wzr, [x5]
-# CHECK-NEXT: 	swpt	x9, xzr, [sp]
-# CHECK-NEXT: 	swpta	w7, wzr, [x5]
-# CHECK-NEXT: 	swpta	x9, xzr, [sp]
-# CHECK-NEXT: 	swptl	w7, wzr, [x5]
-# CHECK-NEXT: 	swptl	x9, xzr, [sp]
-# CHECK-NEXT: 	swptal	w7, wzr, [x5]
-# CHECK-NEXT: 	swptal	x9, xzr, [sp]
-# CHECK-NEXT: 	cast	x0, x1, [x2]
-# CHECK-NEXT: 	cast	x0, x1, [sp]
-# CHECK-NEXT: 	casat	x0, x1, [x2]
-# CHECK-NEXT: 	casat	x0, x1, [sp]
-# CHECK-NEXT: 	casalt	x0, x1, [x2]
-# CHECK-NEXT: 	casalt	x0, x1, [sp]
-# CHECK-NEXT: 	caslt	x0, x1, [x2]
-# CHECK-NEXT: 	caslt	x0, x1, [sp]
-# CHECK-NEXT: 	caspt	x0, x1, x2, x3, [x4]
-# CHECK-NEXT: 	caspt	x0, x1, x2, x3, [sp]
-# CHECK-NEXT: 	caspat	x0, x1, x2, x3, [x4]
-# CHECK-NEXT: 	caspat	x0, x1, x2, x3, [sp]
-# CHECK-NEXT: 	casplt	x0, x1, x2, x3, [x4]
-# CHECK-NEXT: 	casplt	x0, x1, x2, x3, [sp]
-# CHECK-NEXT: 	caspalt	x0, x1, x2, x3, [x4]
-# CHECK-NEXT: 	caspalt	x0, x1, x2, x3, [sp]
-# CHECK-NEXT: 	sttadd	w7, [x5]
-# CHECK-NEXT: 	sttadd	x9, [sp]
-# CHECK-NEXT: 	ldtadda	w7, wzr, [x5]
-# CHECK-NEXT: 	ldtadda	x9, xzr, [sp]
-# CHECK-NEXT: 	sttaddl	w7, [x5]
-# CHECK-NEXT: 	sttaddl	x9, [sp]
-# CHECK-NEXT: 	ldtaddal	w7, wzr, [x5]
-# CHECK-NEXT: 	ldtaddal	x9, xzr, [sp]
-# CHECK-NEXT: 	sttclr	w7, [x5]
-# CHECK-NEXT: 	sttclr	x9, [sp]
-# CHECK-NEXT: 	sttclrl	w7, [x5]
-# CHECK-NEXT: 	sttclrl	x9, [sp]
-# CHECK-NEXT: 	ldtclra	w7, wzr, [x5]
-# CHECK-NEXT: 	ldtclra	x9, xzr, [sp]
-# CHECK-NEXT: 	ldtclral	w7, wzr, [x5]
-# CHECK-NEXT: 	ldtclral	x9, xzr, [sp]
-# CHECK-NEXT: 	sttset	w7, [x5]
-# CHECK-NEXT: 	sttset	x9, [sp]
-# CHECK-NEXT: 	sttsetl	w7, [x5]
-# CHECK-NEXT: 	sttsetl	x9, [sp]
-# CHECK-NEXT: 	ldtseta	w7, wzr, [x5]
-# CHECK-NEXT: 	ldtseta	x9, xzr, [sp]
-# CHECK-NEXT: 	ldtsetal	w7, wzr, [x5]
-# CHECK-NEXT: 	ldtsetal	x9, xzr, [sp]
-# CHECK-NEXT: 	sttadd	w0, [x2]
-# CHECK-NEXT: 	sttadd	w2, [sp]
-# CHECK-NEXT: 	sttadd	x0, [x2]
-# CHECK-NEXT: 	sttadd	x2, [sp]
-# CHECK-NEXT: 	sttadd	w0, [x2]
-# CHECK-NEXT: 	sttadd	w2, [sp]
-# CHECK-NEXT: 	sttadd	x0, [x2]
-# CHECK-NEXT: 	sttadd	x2, [sp]
-# CHECK-NEXT: 	sttadd	w0, [x2]
-# CHECK-NEXT: 	sttadd	w2, [sp]
-# CHECK-NEXT: 	sttadd	x0, [x2]
-# CHECK-NEXT: 	sttadd	x2, [sp]
-# CHECK-NEXT: 	sttadd	w0, [x2]
-# CHECK-NEXT: 	sttadd	w2, [sp]
-# CHECK-NEXT: 	sttadd	x0, [x2]
-# CHECK-NEXT: 	sttadd	x2, [sp]
-# CHECK-NEXT: 	sttclr	w0, [x2]
-# CHECK-NEXT: 	sttclr	w2, [sp]
-# CHECK-NEXT: 	sttclr	x0, [x2]
-# CHECK-NEXT: 	sttclr	x2, [sp]
-# CHECK-NEXT: 	sttclr	w0, [x2]
-# CHECK-NEXT: 	sttclr	w2, [sp]
-# CHECK-NEXT: 	sttclr	x0, [x2]
-# CHECK-NEXT: 	sttclr	x2, [sp]
-# CHECK-NEXT: 	sttclr	w0, [x2]
-# CHECK-NEXT: 	sttclr	w2, [sp]
-# CHECK-NEXT: 	sttclr	x0, [x2]
-# CHECK-NEXT: 	sttclr	x2, [sp]
-# CHECK-NEXT: 	sttclr	w0, [x2]
-# CHECK-NEXT: 	sttclr	x2, [sp]
-# CHECK-NEXT: 	sttclr	x0, [x2]
-# CHECK-NEXT: 	sttclr	x2, [sp]
-# CHECK-NEXT: 	sttset	w0, [x2]
-# CHECK-NEXT: 	sttset	w2, [sp]
-# CHECK-NEXT: 	sttset	x0, [x2]
-# CHECK-NEXT: 	sttset	x2, [sp]
-# CHECK-NEXT: 	sttset	w0, [x2]
-# CHECK-NEXT: 	sttset	w2, [sp]
-# CHECK-NEXT: 	sttset	x0, [x2]
-# CHECK-NEXT: 	sttset	x2, [sp]
-# CHECK-NEXT: 	sttset	w0, [x2]
-# CHECK-NEXT: 	sttset	w2, [sp]
-# CHECK-NEXT: 	sttset	x0, [x2]
-# CHECK-NEXT: 	sttset	x2, [sp]
-# CHECK-NEXT: 	sttset	w0, [x2]
-# CHECK-NEXT: 	sttset	x2, [sp]
-# CHECK-NEXT: 	sttset	x0, [x2]
-# CHECK-NEXT: 	sttset	x2, [sp]
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-mpam.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-mpam.txt
deleted file mode 100644
index b9ff0a4..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-mpam.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-#------------------------------------------------------------------------------
-# Armv9.6-A FEAT_MPAM Extensions
-#------------------------------------------------------------------------------
-
-[0x80,0xa5,0x1e,0xd5]
-# CHECK: msr MPAMBW3_EL3, x0
-
-[0x80,0xa5,0x1c,0xd5]
-# CHECK: msr MPAMBW2_EL2, x0
-
-[0x80,0xa5,0x18,0xd5]
-# CHECK: msr MPAMBW1_EL1, x0
-
-[0x80,0xa5,0x1d,0xd5]
-# CHECK: msr MPAMBW1_EL12, x0
-
-[0xa0,0xa5,0x18,0xd5]
-# CHECK: msr MPAMBW0_EL1, x0
-
-[0xc0,0xa5,0x1c,0xd5]
-# CHECK: msr MPAMBWCAP_EL2, x0
-
-[0xe0,0xa5,0x18,0xd5]
-# CHECK: msr MPAMBWSM_EL1, x0
-
-[0xa0,0xa4,0x38,0xd5]
-# CHECK: mrs x0, MPAMBWIDR_EL1
-
-[0x80,0xa5,0x3e,0xd5]
-# CHECK: mrs x0, MPAMBW3_EL3
-
-[0x80,0xa5,0x3c,0xd5]
-# CHECK: mrs x0, MPAMBW2_EL2
-
-[0x80,0xa5,0x38,0xd5]
-# CHECK: mrs x0, MPAMBW1_EL1
-
-[0x80,0xa5,0x3d,0xd5]
-# CHECK: mrs x0, MPAMBW1_EL12
-
-[0xa0,0xa5,0x38,0xd5]
-# CHECK: mrs x0, MPAMBW0_EL1
-
-[0xc0,0xa5,0x3c,0xd5]
-# CHECK: mrs x0, MPAMBWCAP_EL2
-
-[0xe0,0xa5,0x38,0xd5]
-# CHECK: mrs x0, MPAMBWSM_EL1
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-occmo.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-occmo.txt
deleted file mode 100644
index 5c3b57a..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-occmo.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+occmo -mattr=+mte -disassemble < %s | FileCheck %s
-[0x0c,0x7f,0x0b,0xd5]
-[0xe0,0x7f,0x0b,0xd5]
-[0x0d,0x7b,0x0b,0xd5]
-[0xe1,0x7b,0x0b,0xd5]
-
-# CHECK:      	dc	civaoc, x12
-# CHECK-NEXT: 	dc	cigdvaoc, x0
-# CHECK-NEXT: 	dc	cvaoc, x13
-# CHECK-NEXT: 	dc	cgdvaoc, x1
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-pcdphint.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-pcdphint.txt
deleted file mode 100644
index 3855ce0..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-pcdphint.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+pcdphint %s | FileCheck %s
-
-[0x1f,0x96,0x01,0xd5]
-[0x3f,0x96,0x01,0xd5]
-
-# CHECK:      	stshh	keep
-# CHECK-NEXT: 	stshh	strm
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
deleted file mode 100644
index 75129ac..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mc -triple aarch64 -disassemble %s  | FileCheck %s
-
-[0x00,0x70,0x0e,0xd5]
-[0x01,0x70,0x0e,0xd5]
-[0x02,0x70,0x0e,0xd5]
-[0x11,0x70,0x0e,0xd5]
-[0x1e,0x70,0x0e,0xd5]
-[0xa3,0x21,0x3e,0xd5]
-[0xa4,0x21,0x1e,0xd5]
-
-# CHECK:      	apas x0
-# CHECK-NEXT: 	apas x1
-# CHECK-NEXT: 	apas x2
-# CHECK-NEXT: 	apas x17
-# CHECK-NEXT: 	apas x30
-# CHECK-NEXT: 	mrs	x3, GPCBW_EL3
-# CHECK-NEXT: 	msr	GPCBW_EL3, x4
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-srmask.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-srmask.txt
deleted file mode 100644
index 30d0a60..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-srmask.txt
+++ /dev/null
@@ -1,101 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mc -triple aarch64 -disassemble < %s 2> %t | FileCheck %s
-
-[0x03,0x14,0x38,0xd5]
-[0x03,0x14,0x3c,0xd5]
-[0x03,0x14,0x3d,0xd5]
-[0x43,0x14,0x38,0xd5]
-[0x43,0x14,0x3c,0xd5]
-[0x43,0x14,0x3d,0xd5]
-[0x63,0x14,0x38,0xd5]
-[0x63,0x14,0x3c,0xd5]
-[0x63,0x14,0x3d,0xd5]
-[0x83,0x14,0x38,0xd5]
-[0xc3,0x14,0x38,0xd5]
-[0xe3,0x14,0x38,0xd5]
-[0x43,0x27,0x38,0xd5]
-[0x43,0x27,0x3c,0xd5]
-[0x43,0x27,0x3d,0xd5]
-[0x63,0x27,0x38,0xd5]
-[0x63,0x27,0x3c,0xd5]
-[0x63,0x27,0x3d,0xd5]
-[0xc3,0x27,0x38,0xd5]
-[0xe3,0x27,0x38,0xd5]
-[0x23,0x14,0x38,0xd5]
-[0x23,0x14,0x3c,0xd5]
-[0x23,0x14,0x3d,0xd5]
-[0xa3,0x14,0x38,0xd5]
-
-[0x03,0x14,0x18,0xd5]
-[0x03,0x14,0x1c,0xd5]
-[0x03,0x14,0x1d,0xd5]
-[0x43,0x14,0x18,0xd5]
-[0x43,0x14,0x1c,0xd5]
-[0x43,0x14,0x1d,0xd5]
-[0x63,0x14,0x18,0xd5]
-[0x63,0x14,0x1c,0xd5]
-[0x63,0x14,0x1d,0xd5]
-[0x83,0x14,0x18,0xd5]
-[0xc3,0x14,0x18,0xd5]
-[0xe3,0x14,0x18,0xd5]
-[0x43,0x27,0x18,0xd5]
-[0x43,0x27,0x1c,0xd5]
-[0x43,0x27,0x1d,0xd5]
-[0x63,0x27,0x18,0xd5]
-[0x63,0x27,0x1c,0xd5]
-[0x63,0x27,0x1d,0xd5]
-[0xc3,0x27,0x18,0xd5]
-[0xe3,0x27,0x18,0xd5]
-[0x23,0x14,0x18,0xd5]
-[0x23,0x14,0x1c,0xd5]
-[0x23,0x14,0x1d,0xd5]
-[0xa3,0x14,0x18,0xd5]
-
-# CHECK:      	mrs	x3, SCTLRMASK_EL1
-# CHECK-NEXT: 	mrs	x3, SCTLRMASK_EL2
-# CHECK-NEXT: 	mrs	x3, SCTLRMASK_EL12
-# CHECK-NEXT: 	mrs	x3, CPACRMASK_EL1
-# CHECK-NEXT: 	mrs	x3, CPTRMASK_EL2
-# CHECK-NEXT: 	mrs	x3, CPACRMASK_EL12
-# CHECK-NEXT: 	mrs	x3, SCTLR2MASK_EL1
-# CHECK-NEXT: 	mrs	x3, SCTLR2MASK_EL2
-# CHECK-NEXT: 	mrs	x3, SCTLR2MASK_EL12
-# CHECK-NEXT: 	mrs	x3, CPACRALIAS_EL1
-# CHECK-NEXT: 	mrs	x3, SCTLRALIAS_EL1
-# CHECK-NEXT: 	mrs	x3, SCTLR2ALIAS_EL1
-# CHECK-NEXT: 	mrs	x3, TCRMASK_EL1
-# CHECK-NEXT: 	mrs	x3, TCRMASK_EL2
-# CHECK-NEXT: 	mrs	x3, TCRMASK_EL12
-# CHECK-NEXT: 	mrs	x3, TCR2MASK_EL1
-# CHECK-NEXT: 	mrs	x3, TCR2MASK_EL2
-# CHECK-NEXT: 	mrs	x3, TCR2MASK_EL12
-# CHECK-NEXT: 	mrs	x3, TCRALIAS_EL1
-# CHECK-NEXT: 	mrs	x3, TCR2ALIAS_EL1
-# CHECK-NEXT: 	mrs	x3, ACTLRMASK_EL1
-# CHECK-NEXT: 	mrs	x3, ACTLRMASK_EL2
-# CHECK-NEXT: 	mrs	x3, ACTLRMASK_EL12
-# CHECK-NEXT: 	mrs	x3, ACTLRALIAS_EL1
-# CHECK-NEXT: 	msr	SCTLRMASK_EL1, x3
-# CHECK-NEXT: 	msr	SCTLRMASK_EL2, x3
-# CHECK-NEXT: 	msr	SCTLRMASK_EL12, x3
-# CHECK-NEXT: 	msr	CPACRMASK_EL1, x3
-# CHECK-NEXT: 	msr	CPTRMASK_EL2, x3
-# CHECK-NEXT: 	msr	CPACRMASK_EL12, x3
-# CHECK-NEXT: 	msr	SCTLR2MASK_EL1, x3
-# CHECK-NEXT: 	msr	SCTLR2MASK_EL2, x3
-# CHECK-NEXT: 	msr	SCTLR2MASK_EL12, x3
-# CHECK-NEXT: 	msr	CPACRALIAS_EL1, x3
-# CHECK-NEXT: 	msr	SCTLRALIAS_EL1, x3
-# CHECK-NEXT: 	msr	SCTLR2ALIAS_EL1, x3
-# CHECK-NEXT: 	msr	TCRMASK_EL1, x3
-# CHECK-NEXT: 	msr	TCRMASK_EL2, x3
-# CHECK-NEXT: 	msr	TCRMASK_EL12, x3
-# CHECK-NEXT: 	msr	TCR2MASK_EL1, x3
-# CHECK-NEXT: 	msr	TCR2MASK_EL2, x3
-# CHECK-NEXT: 	msr	TCR2MASK_EL12, x3
-# CHECK-NEXT: 	msr	TCRALIAS_EL1, x3
-# CHECK-NEXT: 	msr	TCR2ALIAS_EL1, x3
-# CHECK-NEXT: 	msr	ACTLRMASK_EL1, x3
-# CHECK-NEXT: 	msr	ACTLRMASK_EL2, x3
-# CHECK-NEXT: 	msr	ACTLRMASK_EL12, x3
-# CHECK-NEXT: 	msr	ACTLRALIAS_EL1, x3
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-statistical-profiling.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-statistical-profiling.txt
deleted file mode 100644
index 446e2f0..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-statistical-profiling.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-[0x60,0x9a,0x1d,0xd5]
-# CHECK: msr PMBSR_EL12, x0
-[0x60,0x9a,0x1c,0xd5]
-# CHECK: msr PMBSR_EL2, x0
-[0x60,0x9a,0x1e,0xd5]
-# CHECK: msr PMBSR_EL3, x0
-
-[0x60,0x9a,0x3d,0xd5]
-# CHECK: mrs x0, PMBSR_EL12
-[0x60,0x9a,0x3c,0xd5]
-# CHECK: mrs x0, PMBSR_EL2
-[0x60,0x9a,0x3e,0xd5]
-# CHECK: mrs x0, PMBSR_EL3
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-trbe-exception.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-trbe-exception.txt
deleted file mode 100644
index 4b39e10..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-trbe-exception.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
-
-[0x60,0x9b,0x1d,0xd5]
-# CHECK: msr TRBSR_EL12, x0
-[0x60,0x9b,0x1c,0xd5]
-# CHECK: msr TRBSR_EL2, x0
-[0x60,0x9b,0x1e,0xd5]
-# CHECK: msr TRBSR_EL3, x0
-
-[0x60,0x9b,0x3d,0xd5]
-# CHECK: mrs x0, TRBSR_EL12
-[0x60,0x9b,0x3c,0xd5]
-# CHECK: mrs x0, TRBSR_EL2
-[0x60,0x9b,0x3e,0xd5]
-# CHECK: mrs x0, TRBSR_EL3
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9a-mec.txt b/llvm/test/MC/Disassembler/AArch64/armv9a-mec.txt
deleted file mode 100644
index c5d931d..0000000
--- a/llvm/test/MC/Disassembler/AArch64/armv9a-mec.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-# RUN: llvm-mc -triple=aarch64 -mattr=+mec -disassemble %s      | FileCheck %s
-# RUN: llvm-mc -triple=aarch64             -disassemble %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-MEC
-
-[0xe0,0xa8,0x3c,0xd5]
-# CHECK: mrs x0, MECIDR_EL2
-# CHECK-NO-MEC: mrs x0, S3_4_C10_C8_7
-[0x00,0xa8,0x3c,0xd5]
-# CHECK: mrs x0, MECID_P0_EL2
-# CHECK-NO-MEC: mrs x0, S3_4_C10_C8_0
-[0x20,0xa8,0x3c,0xd5]
-# CHECK: mrs x0, MECID_A0_EL2
-# CHECK-NO-MEC: mrs x0, S3_4_C10_C8_1
-[0x40,0xa8,0x3c,0xd5]
-# CHECK: mrs x0, MECID_P1_EL2
-# CHECK-NO-MEC: mrs x0, S3_4_C10_C8_2
-[0x60,0xa8,0x3c,0xd5]
-# CHECK: mrs x0, MECID_A1_EL2
-# CHECK-NO-MEC: mrs x0, S3_4_C10_C8_3
-[0x00,0xa9,0x3c,0xd5]
-# CHECK: mrs x0, VMECID_P_EL2
-# CHECK-NO-MEC: mrs x0, S3_4_C10_C9_0
-[0x20,0xa9,0x3c,0xd5]
-# CHECK: mrs x0, VMECID_A_EL2
-# CHECK-NO-MEC: mrs x0, S3_4_C10_C9_1
-[0x20,0xaa,0x3e,0xd5]
-# CHECK: mrs x0, MECID_RL_A_EL3
-# CHECK-NO-MEC: mrs x0, S3_6_C10_C10_1
-[0x00,0xa8,0x1c,0xd5]
-# CHECK: msr MECID_P0_EL2,    x0
-# CHECK-NO-MEC: msr S3_4_C10_C8_0, x0
-[0x20,0xa8,0x1c,0xd5]
-# CHECK: msr MECID_A0_EL2,    x0
-# CHECK-NO-MEC: msr S3_4_C10_C8_1, x0
-[0x40,0xa8,0x1c,0xd5]
-# CHECK: msr MECID_P1_EL2,    x0
-# CHECK-NO-MEC: msr S3_4_C10_C8_2, x0
-[0x60,0xa8,0x1c,0xd5]
-# CHECK: msr MECID_A1_EL2,    x0
-# CHECK-NO-MEC: msr S3_4_C10_C8_3, x0
-[0x00,0xa9,0x1c,0xd5]
-# CHECK: msr VMECID_P_EL2,   x0
-# CHECK-NO-MEC: msr S3_4_C10_C9_0, x0
-[0x20,0xa9,0x1c,0xd5]
-# CHECK: msr VMECID_A_EL2,   x0
-# CHECK-NO-MEC: msr S3_4_C10_C9_1, x0
-[0x20,0xaa,0x1e,0xd5]
-# CHECK: msr MECID_RL_A_EL3, x0
-# CHECK-NO-MEC: msr S3_6_C10_C10_1, x0
-[0xe0,0x7e,0x0c,0xd5]
-# CHECK: dc cigdpae, x0
-# CHECK-NO-MEC: sys #4, c7, c14, #7, x0
-[0x00,0x7e,0x0c,0xd5]
-# CHECK: dc cipae, x0
-# CHECK-NO-MEC: sys #4, c7, c14, #0, x0
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_operands.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_operands.txt
index d72009b..361c49b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_operands.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_operands.txt
@@ -32,3 +32,6 @@
 
 # GFX1250: s_setreg_b32 hwreg(HW_REG_XNACK_MASK), s1 ; encoding: [0x22,0xf8,0x01,0xb9]
 0x22,0xf8,0x01,0xb9
+
+# GFX1250: s_setreg_b32 hwreg(HW_REG_IB_STS2), s1  ; encoding: [0x1c,0xf8,0x01,0xb9]
+0x1c,0xf8,0x01,0xb9
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
index e7026df..af94fbc 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt
@@ -35,3 +35,9 @@
 
 # GFX1250: s_monitor_sleep 1                       ; encoding: [0x01,0x00,0x84,0xbf]
 0x01,0x00,0x84,0xbf
+
+# GFX1250: s_sendmsg sendmsg(MSG_SAVEWAVE_HAS_TDM)     ; encoding: [0x0a,0x00,0xb6,0xbf]
+0x0a,0x00,0xb6,0xbf
+
+# GFX1250: s_barrier_wait 0xfffd                   ; encoding: [0xfd,0xff,0x94,0xbf]
+0xfd,0xff,0x94,0xbf
diff --git a/llvm/test/MC/RISCV/rv32p-invalid.s b/llvm/test/MC/RISCV/rv32p-invalid.s
index da3c67b..2ecce5f 100644
--- a/llvm/test/MC/RISCV/rv32p-invalid.s
+++ b/llvm/test/MC/RISCV/rv32p-invalid.s
@@ -1,19 +1,35 @@
 # RUN: not llvm-mc -triple=riscv32 --mattr=+experimental-p %s 2>&1 \
-# RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+# RUN:     | FileCheck %s
 
 # Imm overflow
-pli.h a0, 0x400
-# CHECK-ERROR: immediate must be an integer in the range [-512, 511]
-plui.h a1, 0x400
-# CHECK-ERROR: immediate must be an integer in the range [-512, 1023]
-pli.b a0, 0x200
-# CHECK-ERROR: immediate must be an integer in the range [0, 255]
-
-pslli.b a6, a7, 100
-# CHECK-ERROR: immediate must be an integer in the range [0, 7]
-pslli.h ra, sp, 100
-# CHECK-ERROR: immediate must be an integer in the range [0, 15]
-psslai.h t0, t1, 100
-# CHECK-ERROR: immediate must be an integer in the range [0, 15]
-sslai a4, a5, -1
-# CHECK-ERROR: immediate must be an integer in the range [0, 31]
+pli.h a0, 0x400 # CHECK: :[[@LINE]]:11: error: immediate must be an integer in the range [-512, 511]
+plui.h a1, 0x400 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-512, 1023]
+pli.b a0, 0x200 # CHECK: :[[@LINE]]:11: error: immediate must be an integer in the range [0, 255]
+
+pslli.b a6, a7, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 7]
+pslli.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15]
+pslli.w ra, sp, 12 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
+
+psslai.h t0, t1, 100 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 15]
+psslai.w t0, t1, 27 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
+sslai a4, a5, -1 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 31]
+
+psrli.b a6, a7, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 7]
+psrli.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15]
+psrli.w ra, sp, 31 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
+
+pusati.h ra, sp, 100 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 15]
+pusati.w ra, sp, 0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
+usati ra, sp, 100 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 31]
+
+psrai.b a6, a7, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 7]
+psrai.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15]
+psrai.w ra, sp, 10 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
+
+psrari.h ra, sp, 100 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 15]
+psrari.w ra, sp, 15 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
+srari ra, sp, 100 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 31]
+
+psati.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15]
+psati.w ra, sp, 24 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
+sati ra, sp, 100 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 31]
diff --git a/llvm/test/MC/RISCV/rv32p-valid.s b/llvm/test/MC/RISCV/rv32p-valid.s
index ffff0f2..1d0fb6d 100644
--- a/llvm/test/MC/RISCV/rv32p-valid.s
+++ b/llvm/test/MC/RISCV/rv32p-valid.s
@@ -76,3 +76,33 @@ plui.h gp, 32
 # CHECK-ASM-AND-OBJ: plui.h gp, -412
 # CHECK-ASM: encoding: [0x9b,0x21,0x99,0xf0]
 plui.h gp, 612
+# CHECK-ASM-AND-OBJ: psrli.b a6, a7, 0
+# CHECK-ASM: encoding: [0x1b,0xc8,0x88,0x80]
+psrli.b a6, a7, 0
+# CHECK-ASM-AND-OBJ: psrli.h ra, sp, 1
+# CHECK-ASM: encoding: [0x9b,0x40,0x11,0x81]
+psrli.h ra, sp, 1
+# CHECK-ASM-AND-OBJ: pusati.h t2, t3, 4
+# CHECK-ASM: encoding: [0x9b,0x43,0x4e,0xa1]
+pusati.h t2, t3, 4
+# CHECK-ASM-AND-OBJ: usati t3, t4, 5
+# CHECK-ASM: encoding: [0x1b,0xce,0x5e,0xa2]
+usati t3, t4, 5
+# CHECK-ASM-AND-OBJ: psrai.b a6, a7, 0
+# CHECK-ASM: encoding: [0x1b,0xc8,0x88,0xc0]
+psrai.b a6, a7, 0
+# CHECK-ASM-AND-OBJ: psrai.h ra, sp, 1
+# CHECK-ASM: encoding: [0x9b,0x40,0x11,0xc1]
+psrai.h ra, sp, 1
+# CHECK-ASM-AND-OBJ: psrari.h t4, t5, 6
+# CHECK-ASM: encoding: [0x9b,0x4e,0x6f,0xd1]
+psrari.h t4, t5, 6
+# CHECK-ASM-AND-OBJ: srari t5, t6, 7
+# CHECK-ASM: encoding: [0x1b,0xcf,0x7f,0xd2]
+srari t5, t6, 7
+# CHECK-ASM-AND-OBJ: psati.h t6, s11, 8
+# CHECK-ASM: encoding: [0x9b,0xcf,0x8d,0xe1]
+psati.h t6, s11, 8
+# CHECK-ASM-AND-OBJ: sati s11, s10, 9
+# CHECK-ASM: encoding: [0x9b,0x4d,0x9d,0xe2]
+sati s11, s10, 9
diff --git a/llvm/test/MC/RISCV/rv64p-invalid.s b/llvm/test/MC/RISCV/rv64p-invalid.s
index 572a099..ccccba2e 100644
--- a/llvm/test/MC/RISCV/rv64p-invalid.s
+++ b/llvm/test/MC/RISCV/rv64p-invalid.s
@@ -1,21 +1,35 @@
 # RUN: not llvm-mc -triple=riscv64 --mattr=+experimental-p %s 2>&1 \
-# RUN:        | FileCheck %s --check-prefixes=CHECK-ERROR
+# RUN:     | FileCheck %s
 
 # Imm overflow
-pli.h a0, 0x400
-# CHECK-ERROR: immediate must be an integer in the range [-512, 511]
-plui.h a1, 0x400
-# CHECK-ERROR: immediate must be an integer in the range [-512, 1023]
-pli.w a1, -0x201
-# CHECK-ERROR: immediate must be an integer in the range [-512, 511]
-
-pslli.b a6, a7, 100
-# CHECK-ERROR: immediate must be an integer in the range [0, 7]
-pslli.h ra, sp, 100
-# CHECK-ERROR: immediate must be an integer in the range [0, 15]
-pslli.w ra, sp, 100
-# CHECK-ERROR: immediate must be an integer in the range [0, 31]
-psslai.h t0, t1, 100
-# CHECK-ERROR: immediate must be an integer in the range [0, 15]
-psslai.w a4, a5, -1
-# CHECK-ERROR: error: immediate must be an integer in the range [0, 31]
+pli.h a0, 0x400 # CHECK: :[[@LINE]]:11: error: immediate must be an integer in the range [-512, 511]
+plui.h a1, 0x400 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-512, 1023]
+pli.w a1, -0x201 # CHECK: :[[@LINE]]:11: error: immediate must be an integer in the range [-512, 511]
+
+pslli.b a6, a7, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 7]
+pslli.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15]
+pslli.w ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 31]
+
+psslai.h t0, t1, 100 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 15]
+psslai.w a4, a5, -1 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 31]
+sslai ra, sp, 10 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV32I Base Instruction Set
+
+psrli.b a6, a7, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 7]
+psrli.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15]
+psrli.w ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 31]
+
+pusati.h ra, sp, 100 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 15]
+pusati.w ra, sp, 100 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 31]
+usati ra, sp, 100 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 63]
+
+psrai.b a6, a7, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 7]
+psrai.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15]
+psrai.w ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 31]
+
+psrari.h ra, sp, 100 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 15]
+psrari.w ra, sp, 100 # CHECK: :[[@LINE]]:18: error: immediate must be an integer in the range [0, 31]
+srari ra, sp, 100 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 63]
+
+psati.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15]
+psati.w ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 31]
+sati ra, sp, 100 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 63]
diff --git a/llvm/test/MC/RISCV/rv64p-valid.s b/llvm/test/MC/RISCV/rv64p-valid.s
index a0d6ead..13cfd5e 100644
--- a/llvm/test/MC/RISCV/rv64p-valid.s
+++ b/llvm/test/MC/RISCV/rv64p-valid.s
@@ -55,7 +55,7 @@ max t3, t4, t5
 # CHECK-ASM-AND-OBJ: maxu a4, a5, a6
 # CHECK-ASM: encoding: [0x33,0xf7,0x07,0x0b]
 maxu a4, a5, a6
-# CHECK-ASM-AND-OBJ: pslli.b a6, a7
+# CHECK-ASM-AND-OBJ: pslli.b a6, a7, 0
 # CHECK-ASM: encoding: [0x1b,0xa8,0x88,0x80]
 pslli.b a6, a7, 0
 # CHECK-ASM-AND-OBJ: pslli.h ra, sp, 1
@@ -106,3 +106,48 @@ plui.w a2, 1
 # CHECK-ASM-AND-OBJ: plui.w a2, -1
 # CHECK-ASM: encoding: [0x1b,0xa6,0xff,0xf3]
 plui.w a2, 1023
+# CHECK-ASM-AND-OBJ: psrli.b a6, a7, 0
+# CHECK-ASM: encoding: [0x1b,0xc8,0x88,0x80]
+psrli.b a6, a7, 0
+# CHECK-ASM-AND-OBJ: psrli.h ra, sp, 1
+# CHECK-ASM: encoding: [0x9b,0x40,0x11,0x81]
+psrli.h ra, sp, 1
+# CHECK-ASM-AND-OBJ: psrli.w ra, sp, 2
+# CHECK-ASM: encoding: [0x9b,0x40,0x21,0x82]
+psrli.w ra, sp, 2
+# CHECK-ASM-AND-OBJ: pusati.h t2, t3, 4
+# CHECK-ASM: encoding: [0x9b,0x43,0x4e,0xa1]
+pusati.h t2, t3, 4
+# CHECK-ASM-AND-OBJ: pusati.w t2, t3, 5
+# CHECK-ASM: encoding: [0x9b,0x43,0x5e,0xa2]
+pusati.w t2, t3, 5
+# CHECK-ASM-AND-OBJ: usati t3, t4, 5
+# CHECK-ASM: encoding: [0x1b,0xce,0x5e,0xa4]
+usati t3, t4, 5
+# CHECK-ASM-AND-OBJ: psrai.b a6, a7, 0
+# CHECK-ASM: encoding: [0x1b,0xc8,0x88,0xc0]
+psrai.b a6, a7, 0
+# CHECK-ASM-AND-OBJ: psrai.h ra, sp, 1
+# CHECK-ASM: encoding: [0x9b,0x40,0x11,0xc1]
+psrai.h ra, sp, 1
+# CHECK-ASM-AND-OBJ: psrai.w ra, sp, 2
+# CHECK-ASM: encoding: [0x9b,0x40,0x21,0xc2]
+psrai.w ra, sp, 2
+# CHECK-ASM-AND-OBJ: psrari.h t4, t5, 6
+# CHECK-ASM: encoding: [0x9b,0x4e,0x6f,0xd1]
+psrari.h t4, t5, 6
+# CHECK-ASM-AND-OBJ: psrari.w t5, t6, 7
+# CHECK-ASM: encoding: [0x1b,0xcf,0x7f,0xd2]
+psrari.w t5, t6, 7
+# CHECK-ASM-AND-OBJ: srari t6, s11, 63
+# CHECK-ASM: encoding: [0x9b,0xcf,0xfd,0xd7]
+srari t6, s11, 63
+# CHECK-ASM-AND-OBJ: psati.h s11, s10, 9
+# CHECK-ASM: encoding: [0x9b,0x4d,0x9d,0xe1]
+psati.h s11, s10, 9
+# CHECK-ASM-AND-OBJ: psati.w s10, s9, 10
+# CHECK-ASM: encoding: [0x1b,0xcd,0xac,0xe2]
+psati.w s10, s9, 10
+# CHECK-ASM-AND-OBJ: sati s9, s8, 32
+# CHECK-ASM: encoding: [0x9b,0x4c,0x0c,0xe6]
+sati s9, s8, 32
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-typeof.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-typeof.td
index 7fe63b1..ef918d4 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-typeof.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-typeof.td
@@ -20,18 +20,18 @@ def Test0 : GICombineRule<
 // CHECK-NEXT:       GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled),
 // CHECK-NEXT:       GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_MUL),
 // CHECK-NEXT:       // MIs[0] dst
-// CHECK-NEXT:       GIM_RecordRegType, /*MI*/0, /*Op*/0, /*TempTypeIdx*/uint8_t(-1),
+// CHECK-NEXT:       GIM_RecordRegType, /*MI*/0, /*Op*/0, /*TempTypeIdx*/255,
 // CHECK-NEXT:       // MIs[0] src
-// CHECK-NEXT:       GIM_RecordRegType, /*MI*/0, /*Op*/1, /*TempTypeIdx*/uint8_t(-2),
+// CHECK-NEXT:       GIM_RecordRegType, /*MI*/0, /*Op*/1, /*TempTypeIdx*/254,
 // CHECK-NEXT:       // MIs[0] Operand 2
-// CHECK-NEXT:       GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, uint8_t(-1),
-// CHECK-NEXT:       GIR_MakeTempReg, /*TempRegID*/1, /*TypeID*/uint8_t(-2),
+// CHECK-NEXT:       GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, 255,
+// CHECK-NEXT:       GIR_MakeTempReg, /*TempRegID*/1, /*TypeID*/254,
 // CHECK-NEXT:       GIR_BuildConstant, /*TempRegID*/1, /*Val*/GIMT_Encode8(0),
-// CHECK-NEXT:       GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/uint8_t(-1),
+// CHECK-NEXT:       GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/255,
 // CHECK-NEXT:       // Combiner Rule #0: Test0
 // CHECK-NEXT:       GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(TargetOpcode::G_CONSTANT),
 // CHECK-NEXT:       GIR_AddTempRegister, /*InsnID*/0, /*TempRegID*/0, /*TempRegFlags*/GIMT_Encode2(RegState::Define),
-// CHECK-NEXT:       GIR_AddCImm, /*InsnID*/0, /*Type*/uint8_t(-2), /*Imm*/GIMT_Encode8(42),
+// CHECK-NEXT:       GIR_AddCImm, /*InsnID*/0, /*Type*/254, /*Imm*/GIMT_Encode8(42),
 // CHECK-NEXT:       GIR_BuildMI, /*InsnID*/1, /*Opcode*/GIMT_Encode2(TargetOpcode::G_SUB),
 // CHECK-NEXT:       GIR_Copy, /*NewInsnID*/1, /*OldInsnID*/0, /*OpIdx*/0, // dst
 // CHECK-NEXT:       GIR_AddSimpleTempRegister, /*InsnID*/1, /*TempRegID*/1,
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td
index 92baab9..8907cfe 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td
@@ -181,7 +181,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [
 // CHECK-NEXT:       GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
 // CHECK-NEXT:       GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_CONSTANT),
 // CHECK-NEXT:       // MIs[1] z
-// CHECK-NEXT:       GIM_CheckLiteralInt, /*MI*/1, /*Op*/1, GIMT_Encode8(-42),
+// CHECK-NEXT:       GIM_CheckLiteralInt, /*MI*/1, /*Op*/1, GIMT_Encode8(18446744073709551574u),
 // CHECK-NEXT:       GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, 43,
 // CHECK-NEXT:       GIM_CheckIsSafeToFold, /*NumInsns*/1,
 // CHECK-NEXT:       // Combiner Rule #5: InOutInstTest1
diff --git a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
index 4a516c6..7a86b5b 100644
--- a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td
@@ -617,11 +617,11 @@ def MOV : I<(outs GPR32:$dst), (ins GPR32:$src1),
 // R02N-NEXT:    // MIs[0] Operand 2
 // R02N-NEXT:    GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
 //
-// R02C-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, uint8_t(-2)
+// R02C-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, 254,
 // R02C-NEXT:    // (xor:{ *:[i32] } GPR32:{ *:[i32] }:$src1, -2:{ *:[i32] }) => (XORI:{ *:[i32] } GPR32:{ *:[i32] }:$src1)
 // R02C-NEXT:    GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::XORI),
 // R02C-NEXT:    GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
-// R02C-NEXT:    GIR_AddImm8, /*InsnID*/0, /*Imm*/uint8_t(-1),
+// R02C-NEXT:    GIR_AddImm8, /*InsnID*/0, /*Imm*/255,
 // R02C-NEXT:    GIR_RootToRootCopy, /*OpIdx*/1, // src1
 // R02C-NEXT:    GIR_RootConstrainSelectedInstOperands,
 // R02C-NEXT:    // GIR_Coverage, 2,
@@ -648,7 +648,7 @@ def XORI : I<(outs GPR32:$dst), (ins m1:$src2, GPR32:$src1),
 // NOOPT-NEXT:    GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
 // NOOPT-NEXT:    // MIs[0] Operand 2
 // NOOPT-NEXT:    GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, uint8_t(-3)
+// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, 253,
 // NOOPT-NEXT:    // (xor:{ *:[i32] } GPR32:{ *:[i32] }:$src1, -3:{ *:[i32] }) => (XOR:{ *:[i32] } GPR32:{ *:[i32] }:$src1)
 // NOOPT-NEXT:    GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::XOR),
 // NOOPT-NEXT:    GIR_RootToRootCopy, /*OpIdx*/0, //  DstI[dst]
@@ -676,11 +676,11 @@ def XOR : I<(outs GPR32:$dst), (ins Z:$src2, GPR32:$src1),
 // NOOPT-NEXT:    GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
 // NOOPT-NEXT:    // MIs[0] Operand 2
 // NOOPT-NEXT:    GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, uint8_t(-4)
+// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, 252,
 // NOOPT-NEXT:    // (xor:{ *:[i32] } GPR32:{ *:[i32] }:$src1, -4:{ *:[i32] }) => (XORlike:{ *:[i32] } GPR32:{ *:[i32] }:$src1)
 // NOOPT-NEXT:    GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::XORlike),
 // NOOPT-NEXT:    GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
-// NOOPT-NEXT:    GIR_AddImm8, /*InsnID*/0, /*Imm*/uint8_t(-1),
+// NOOPT-NEXT:    GIR_AddImm8, /*InsnID*/0, /*Imm*/255,
 // NOOPT-NEXT:    GIR_AddRegister, /*InsnID*/0, GIMT_Encode2(MyTarget::R0),
 // NOOPT-NEXT:    GIR_RootToRootCopy, /*OpIdx*/1, // src1
 // NOOPT-NEXT:    GIR_RootConstrainSelectedInstOperands,
@@ -705,11 +705,11 @@ def XORlike : I<(outs GPR32:$dst), (ins m1Z:$src2, GPR32:$src1),
 // NOOPT-NEXT:    GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
 // NOOPT-NEXT:    // MIs[0] Operand 2
 // NOOPT-NEXT:    GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, uint8_t(-5),
+// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, 251,
 // NOOPT-NEXT:    // (xor:{ *:[i32] } GPR32:{ *:[i32] }:$src1, -5:{ *:[i32] }) => (XORManyDefaults:{ *:[i32] } GPR32:{ *:[i32] }:$src1)
 // NOOPT-NEXT:    GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::XORManyDefaults),
 // NOOPT-NEXT:    GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
-// NOOPT-NEXT:    GIR_AddImm8, /*InsnID*/0, /*Imm*/uint8_t(-1),
+// NOOPT-NEXT:    GIR_AddImm8, /*InsnID*/0, /*Imm*/255,
 // NOOPT-NEXT:    GIR_AddRegister, /*InsnID*/0, GIMT_Encode2(MyTarget::R0),
 // NOOPT-NEXT:    GIR_AddRegister, /*InsnID*/0, GIMT_Encode2(MyTarget::R0),
 // NOOPT-NEXT:    GIR_RootToRootCopy, /*OpIdx*/1, // src1
@@ -735,7 +735,7 @@ def XORManyDefaults : I<(outs GPR32:$dst), (ins m1Z:$src3, Z:$src2, GPR32:$src1)
 // NOOPT-NEXT:    GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
 // NOOPT-NEXT:    // MIs[0] Operand 2
 // NOOPT-NEXT:    GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, uint8_t(-6)
+// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, 250,
 // NOOPT-NEXT:    // (xor:{ *:[i32] } GPR32:{ *:[i32] }:$src1, -6:{ *:[i32] }) => (XORIb:{ *:[i32] } GPR32:{ *:[i32] }:$src1)
 // NOOPT-NEXT:    GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::XORIb),
 // NOOPT-NEXT:    GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
@@ -766,7 +766,7 @@ def XORIb : I<(outs GPR32:$dst), (ins mb:$src2, GPR32:$src1),
 // NOOPT-NEXT:    GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
 // NOOPT-NEXT:    // MIs[0] Operand 2
 // NOOPT-NEXT:    GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s32,
-// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, uint8_t(-1),
+// NOOPT-NEXT:    GIM_CheckConstantInt8, /*MI*/0, /*Op*/2, 255,
 // NOOPT-NEXT:    // (xor:{ *:[i32] } GPR32:{ *:[i32] }:$Wm, -1:{ *:[i32] }) => (ORN:{ *:[i32] } R0:{ *:[i32] }, GPR32:{ *:[i32] }:$Wm)
 // NOOPT-NEXT:    GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ORN),
 // NOOPT-NEXT:    GIR_RootToRootCopy, /*OpIdx*/0, //  DstI[dst]
diff --git a/llvm/test/TableGen/GlobalISelEmitter/int64min.td b/llvm/test/TableGen/GlobalISelEmitter/int64min.td
new file mode 100644
index 0000000..ccdb749
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter/int64min.td
@@ -0,0 +1,30 @@
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../../include -I %p/../Common %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+def GPR : RegisterClass<"MyTarget", [i64], 64, (add R0)>;
+def ANDI : I<(outs GPR:$dst), (ins GPR:$src1, i64imm:$src2), []>;
+
+// CHECK-LABEL: GIM_Try, /*On fail goto*//*Label 0*/ GIMT_Encode4(59), // Rule ID 0 //
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_AND),
+// CHECK-NEXT: // MIs[0] DstI[dst]
+// CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_s64,
+// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID),
+// CHECK-NEXT: // MIs[0] rs1
+// CHECK-NEXT: GIM_RootCheckType, /*Op*/1, /*Type*/GILLT_s64,
+// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/1, /*RC*/GIMT_Encode2(MyTarget::GPRRegClassID),
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_RootCheckType, /*Op*/2, /*Type*/GILLT_s64,
+// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, GIMT_Encode8(9223372036854775808u),
+// CHECK-NEXT: // (and:{ *:[i64] } GPR:{ *:[i64] }:$rs1, -9223372036854775808:{ *:[i64] })  =>  (ANDI:{ *:[i64] } GPR:{ *:[i64] }:$rs1, -9223372036854775808:{ *:[i64] })
+// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ANDI),
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // rs1
+// CHECK-NEXT: GIR_AddImm, /*InsnID*/0, /*Imm*/GIMT_Encode8(9223372036854775808u),
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_EraseRootFromParent_Done,
+def : Pat<(and GPR:$rs1, 0x8000000000000000),
+          (ANDI GPR:$rs1, 0x8000000000000000)>;
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
index feef075..fcc5f7e 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
@@ -41,7 +41,7 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 >;
 
 
-// CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, FloatABI::ABIType FloatABI) {
+// CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) {
 // CHECK: if (TT.getArch() == Triple::avr && TT.isOSHurd()) {
 // CHECK-NEXT:   const CallingConv::ID DefaultCC = isFoo() ? CallingConv::Fast : CallingConv::GHC;
 // CHECK-NEXT:   for (CallingConv::ID &Entry : LibcallImplCallingConvs) {
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index 59ccd23..a2d946f 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -150,7 +150,7 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: };
 
 
-// CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, FloatABI::ABIType FloatABI) {
+// CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) {
 // CHECK-NEXT:  struct LibcallImplPair {
 // CHECK-NEXT:    RTLIB::Libcall Func;
 // CHECK-NEXT:    RTLIB::LibcallImpl Impl;
diff --git a/llvm/test/Transforms/Coroutines/coro-async.ll b/llvm/test/Transforms/Coroutines/coro-async.ll
index e5d2e6c..331d6a6 100644
--- a/llvm/test/Transforms/Coroutines/coro-async.ll
+++ b/llvm/test/Transforms/Coroutines/coro-async.ll
@@ -496,6 +496,35 @@ entry:
 ; CHECK: call void @use(ptr null)
 ; CHECK: ret
 
+@simpleFuncTu = global <{i32, i32}> <{
+  i32 trunc (i64 sub (i64 ptrtoint (ptr @simpleFunc to i64),
+             i64 ptrtoint (ptr @simpleFuncTu to i64)) to i32), i32 16 }>
+
+define swifttailcc void @simpleFunc(ptr swiftasync %0) presplitcoroutine {
+entry:
+  %1 = alloca ptr, align 8
+  %2 = call token @llvm.coro.id.async(i32 16, i32 16, i32 0, ptr @simpleFuncTu)
+  %3 = call ptr @llvm.coro.begin(token %2, ptr null)
+  store ptr %0, ptr %1, align 8
+  %4 = load ptr, ptr %1, align 8
+  %5 = getelementptr inbounds <{ ptr, ptr }>, ptr %4, i32 0, i32 1
+  %6 = load ptr, ptr %5, align 8
+  %7 = load ptr, ptr %1, align 8
+  %8 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @simpleFunc.0, ptr %6, ptr %7)
+  unreachable
+}
+
+; CHECK-LABEL: define swifttailcc void @simpleFunc(ptr swiftasync %0) {
+; CHECK-NOT: define
+; CHECK:  [[RESUME:%.*]] = load ptr
+; CHECK:  musttail call swifttailcc void [[RESUME]]
+
+define internal swifttailcc void @simpleFunc.0(ptr %0, ptr %1) alwaysinline {
+entry:
+  musttail call swifttailcc void %0(ptr swiftasync %1)
+  ret void
+}
+
 declare { ptr, ptr, ptr, ptr } @llvm.coro.suspend.async.sl_p0i8p0i8p0i8p0i8s(i32, ptr, ptr, ...)
 declare ptr @llvm.coro.prepare.async(ptr)
 declare token @llvm.coro.id.async(i32, i32, i32, ptr)
diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll
index 1a41c1f..cb42809 100644
--- a/llvm/test/Transforms/InstCombine/icmp-add.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-add.ll
@@ -3300,3 +3300,149 @@ entry:
   %cmp = icmp ult i32 %add, 253
   ret i1 %cmp
 }
+
+; PR 152851
+
+define i1 @val_is_aligend_const_pow2(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_pow2(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[NUM:%.*]], 4095
+; CHECK-NEXT:    [[_0:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4095
+  %num.masked = and i32 %num.biased, -4096
+  %_0 = icmp eq i32 %num.masked, %num
+  ret i1 %_0
+}
+
+define i1 @val_is_aligend_const_pow2_add_commute(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_pow2_add_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[NUM:%.*]], 4095
+; CHECK-NEXT:    [[_0:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32  4095, %num
+  %num.masked = and i32 %num.biased, -4096
+  %_0 = icmp eq i32 %num.masked, %num
+  ret i1 %_0
+}
+
+define i1 @val_is_aligend_const_pow2_and_commute(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_pow2_and_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[NUM:%.*]], 4095
+; CHECK-NEXT:    [[_0:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4095
+  %num.masked = and i32 -4096, %num.biased
+  %_0 = icmp eq i32 %num.masked, %num
+  ret i1 %_0
+}
+
+define i1 @val_is_aligend_const_pow2_icm_commute(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_pow2_icm_commute(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[NUM:%.*]], 4095
+; CHECK-NEXT:    [[_0:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4095
+  %num.masked = and i32 %num.biased, -4096
+  %_0 = icmp eq i32 %num, %num.masked
+  ret i1 %_0
+}
+
+; Should not work for non-power-of-two cases
+define i1 @val_is_aligend_const_non_pow2(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_non_pow2(
+; CHECK-NEXT:    [[NUM_BIASED:%.*]] = add i32 [[NUM:%.*]], 6
+; CHECK-NEXT:    [[NUM_MASKED:%.*]] = and i32 [[NUM_BIASED]], -7
+; CHECK-NEXT:    [[_0:%.*]] = icmp eq i32 [[NUM_MASKED]], [[NUM]]
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 6
+  %num.masked = and i32 %num.biased, -7
+  %_0 = icmp eq i32 %num.masked, %num
+  ret i1 %_0
+}
+
+define i1 @val_is_aligend_const_pow2_multiuse(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_pow2_multiuse(
+; CHECK-NEXT:    [[NUM_BIASED:%.*]] = add i32 [[NUM:%.*]], 4095
+; CHECK-NEXT:    [[NUM_MASKED:%.*]] = and i32 [[NUM_BIASED]], -4096
+; CHECK-NEXT:    call void @use(i32 [[NUM_MASKED]])
+; CHECK-NEXT:    [[_0:%.*]] = icmp eq i32 [[NUM_MASKED]], [[NUM]]
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4095
+  %num.masked = and i32 %num.biased, -4096
+  call void @use(i32 %num.masked)
+  %_0 = icmp eq i32 %num.masked, %num
+  ret i1 %_0
+}
+
+; Does not apply when the add has another use: the expected output below is unchanged
+define i1 @val_is_aligend_const_pow2_multiuse1(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_pow2_multiuse1(
+; CHECK-NEXT:    [[NUM_BIASED:%.*]] = add i32 [[NUM:%.*]], 4095
+; CHECK-NEXT:    call void @use(i32 [[NUM_BIASED]])
+; CHECK-NEXT:    [[NUM_MASKED:%.*]] = and i32 [[NUM_BIASED]], -4096
+; CHECK-NEXT:    [[_0:%.*]] = icmp eq i32 [[NUM_MASKED]], [[NUM]]
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4095
+  call void @use(i32 %num.biased)
+  %num.masked = and i32 %num.biased, -4096
+  %_0 = icmp eq i32 %num.masked, %num
+  ret i1 %_0
+}
+
+define i1 @val_is_aligend_const_pow2_ne(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_pow2_ne(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[NUM:%.*]], 4095
+; CHECK-NEXT:    [[_0:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4095
+  %num.masked = and i32 %num.biased, -4096
+  %_0 = icmp ne i32 %num.masked, %num
+  ret i1 %_0
+}
+
+define i1 @val_is_aligend_const_mismatch(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_mismatch(
+; CHECK-NEXT:    [[NUM_BIASED:%.*]] = add i32 [[NUM:%.*]], 4095
+; CHECK-NEXT:    [[NUM_MASKED:%.*]] = and i32 [[NUM_BIASED]], -4095
+; CHECK-NEXT:    [[_0:%.*]] = icmp ne i32 [[NUM_MASKED]], [[NUM]]
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4095
+  %num.masked = and i32 %num.biased, -4095
+  %_0 = icmp ne i32 %num.masked, %num
+  ret i1 %_0
+}
+
+define i1 @val_is_aligend_const_mismatch1(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_const_mismatch1(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[NUM:%.*]], -4096
+; CHECK-NEXT:    [[NUM_MASKED:%.*]] = add i32 [[TMP1]], 4096
+; CHECK-NEXT:    [[_0:%.*]] = icmp ne i32 [[NUM_MASKED]], [[NUM]]
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4096
+  %num.masked = and i32 %num.biased, -4096
+  %_0 = icmp ne i32 %num.masked, %num
+  ret i1 %_0
+}
+
+define i1 @val_is_aligend_pred_mismatch(i32 %num) {
+; CHECK-LABEL: @val_is_aligend_pred_mismatch(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[NUM:%.*]], -4096
+; CHECK-NEXT:    [[NUM_MASKED:%.*]] = add i32 [[TMP1]], 4096
+; CHECK-NEXT:    [[_0:%.*]] = icmp sge i32 [[NUM_MASKED]], [[NUM]]
+; CHECK-NEXT:    ret i1 [[_0]]
+;
+  %num.biased = add i32 %num, 4096
+  %num.masked = and i32 %num.biased, -4096
+  %_0 = icmp sge i32 %num.masked, %num
+  ret i1 %_0
+}
diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll
index 1b13477..724f459 100644
--- a/llvm/test/Transforms/LICM/hoist-binop.ll
+++ b/llvm/test/Transforms/LICM/hoist-binop.ll
@@ -22,6 +22,31 @@ loop:
   br label %loop
 }
 
+; Don't hoist ADD if the op has more than one use.
+define void @add_two_uses(i64 %c1, i64 %c2) {
+; CHECK-LABEL: @add_two_uses(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1:%.*]]
+; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[STEP_ADD]], [[C2:%.*]]
+; CHECK-NEXT:    call void @use(i64 [[INDEX_NEXT]])
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
+  %step.add = add i64 %index, %c1
+  call void @use(i64 %step.add)
+  %index.next = add i64 %step.add, %c2
+  call void @use(i64 %index.next)
+  br label %loop
+}
+
 ; Hoist MUL and remove old op if unused.
 define void @mul_one_use(i64 %c1, i64 %c2) {
 ; CHECK-LABEL: @mul_one_use(
@@ -51,8 +76,6 @@ define void @add_nuw(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -62,7 +85,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nuw i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = add nuw i64 %step.add, %c2
   br label %loop
 }
@@ -76,8 +98,6 @@ define void @add_nuw_comm(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -87,7 +107,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nuw i64 %c1, %index
-  call void @use(i64 %step.add)
   %index.next = add nuw i64 %step.add, %c2
   br label %loop
 }
@@ -101,8 +120,6 @@ define void @add_nuw_comm2(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -112,7 +129,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nuw i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = add nuw i64 %c2, %step.add
   br label %loop
 }
@@ -126,8 +142,6 @@ define void @add_nuw_comm3(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -137,7 +151,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nuw i64 %c1, %index
-  call void @use(i64 %step.add)
   %index.next = add nuw i64 %c2, %step.add
   br label %loop
 }
@@ -152,8 +165,6 @@ define void @add_nuw_twobinops(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw i64 [[C1]], [[INDEX]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add nuw i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -163,135 +174,134 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nuw i64 %c1, %index
-  call void @use(i64 %step.add)
   %c2.plus.2 = add nuw i64 %c2, 2
   %index.next = add nuw i64 %step.add, %c2.plus.2
   br label %loop
 }
 
 ; Hoist MUL and drop NUW even if both ops have it.
-define void @mul_nuw(i64 %c1, i64 %c2) {
+define void @mul_nuw(<2 x i64> %c1, <2 x i64> %c2) {
 ; CHECK-LABEL: @mul_nuw(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul <2 x i64> [[C1:%.*]], [[C2:%.*]]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
-; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw <2 x i64> [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul <2 x i64> [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
   br label %loop
 
 loop:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
-  %step.add = mul nuw i64 %index, %c1
-  call void @use(i64 %step.add)
-  %index.next = mul nuw i64 %step.add, %c2
+  %index = phi <2 x i64> [ zeroinitializer, %entry ], [ %index.next, %loop ]
+  %step.add = mul nuw <2 x i64> %index, %c1
+  call void @use(<2 x i64> %step.add)
+  %index.next = mul nuw <2 x i64> %step.add, %c2
   br label %loop
 }
 
 ; Hoist MUL and drop NUW even if both ops have it.
 ; Version where operands are commuted.
-define void @mul_nuw_comm(i64 %c1, i64 %c2) {
+define void @mul_nuw_comm(<2 x i64> %c1, <2 x i64> %c2) {
 ; CHECK-LABEL: @mul_nuw_comm(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul <2 x i64> [[C1:%.*]], [[C2:%.*]]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
-; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw <2 x i64> [[C1]], [[INDEX]]
+; CHECK-NEXT:    call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul <2 x i64> [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
   br label %loop
 
 loop:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
-  %step.add = mul nuw i64 %c1, %index
-  call void @use(i64 %step.add)
-  %index.next = mul nuw i64 %step.add, %c2
+  %index = phi <2 x i64> [ zeroinitializer, %entry ], [ %index.next, %loop ]
+  %step.add = mul nuw <2 x i64> %c1, %index
+  call void @use(<2 x i64> %step.add)
+  %index.next = mul nuw <2 x i64> %step.add, %c2
   br label %loop
 }
 
 ; Hoist MUL and drop NUW even if both ops have it.
 ; Another version where operands are commuted.
-define void @mul_nuw_comm2(i64 %c1, i64 %c2) {
+define void @mul_nuw_comm2(<2 x i64> %c1, <2 x i64> %c2) {
 ; CHECK-LABEL: @mul_nuw_comm2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul <2 x i64> [[C1:%.*]], [[C2:%.*]]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
-; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw <2 x i64> [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul <2 x i64> [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
   br label %loop
 
 loop:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
-  %step.add = mul nuw i64 %index, %c1
-  call void @use(i64 %step.add)
-  %index.next = mul nuw i64 %c2, %step.add
+  %index = phi <2 x i64> [ zeroinitializer, %entry ], [ %index.next, %loop ]
+  %step.add = mul nuw <2 x i64> %index, %c1
+  call void @use(<2 x i64> %step.add)
+  %index.next = mul nuw <2 x i64> %c2, %step.add
   br label %loop
 }
 
 ; Hoist MUL and drop NUW even if both ops have it.
 ; Another version where operands are commuted.
-define void @mul_nuw_comm3(i64 %c1, i64 %c2) {
+define void @mul_nuw_comm3(<2 x i64> %c1, <2 x i64> %c2) {
 ; CHECK-LABEL: @mul_nuw_comm3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul <2 x i64> [[C1:%.*]], [[C2:%.*]]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
-; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw <2 x i64> [[C1]], [[INDEX]]
+; CHECK-NEXT:    call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul <2 x i64> [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
   br label %loop
 
 loop:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
-  %step.add = mul nuw i64 %c1, %index
-  call void @use(i64 %step.add)
-  %index.next = mul nuw i64 %c2, %step.add
+  %index = phi <2 x i64> [ zeroinitializer, %entry ], [ %index.next, %loop ]
+  %step.add = mul nuw <2 x i64> %c1, %index
+  call void @use(<2 x i64> %step.add)
+  %index.next = mul nuw <2 x i64> %c2, %step.add
   br label %loop
 }
 
 ; Hoist MUL and drop NUW even if both ops have it.
 ; A version where the LHS and RHS of the outer BinOp are BinOps.
-define void @mul_nuw_twobinops(i64 %c1, i64 %c2) {
+define void @mul_nuw_twobinops(<2 x i64> %c1, <2 x i64> %c2) {
 ; CHECK-LABEL: @mul_nuw_twobinops(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[C2_PLUS_2:%.*]] = add nuw i64 [[C2:%.*]], 2
-; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2_PLUS_2]]
+; CHECK-NEXT:    [[C2_PLUS_2:%.*]] = add nuw <2 x i64> [[C2:%.*]], splat (i64 2)
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul <2 x i64> [[C1:%.*]], [[C2_PLUS_2]]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw i64 [[C1]], [[INDEX]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
-; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nuw <2 x i64> [[C1]], [[INDEX]]
+; CHECK-NEXT:    call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul <2 x i64> [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
   br label %loop
 
 loop:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
-  %step.add = mul nuw i64 %c1, %index
-  call void @use(i64 %step.add)
-  %c2.plus.2 = add nuw i64 %c2, 2
-  %index.next = mul nuw i64 %step.add, %c2.plus.2
+  %index = phi <2 x i64> [ zeroinitializer, %entry ], [ %index.next, %loop ]
+  %step.add = mul nuw <2 x i64> %c1, %index
+  call void @use(<2 x i64> %step.add)
+  %c2.plus.2 = add nuw <2 x i64> %c2, <i64 2, i64 2>
+  %index.next = mul nuw <2 x i64> %step.add, %c2.plus.2
   br label %loop
 }
 
@@ -303,8 +313,6 @@ define void @add_no_nuw(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -314,7 +322,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = add nuw i64 %step.add, %c2
   br label %loop
 }
@@ -327,8 +334,6 @@ define void @add_no_nsw(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -338,7 +343,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = add nsw i64 %step.add, %c2
   br label %loop
 }
@@ -351,8 +355,6 @@ define void @add_no_nsw_2(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nsw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -362,7 +364,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nsw i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = add nsw i64 %step.add, %c2
   br label %loop
 }
@@ -375,8 +376,6 @@ define void @add_nuw_nsw(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw nsw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add nuw nsw i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -386,7 +385,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nuw nsw i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = add nuw nsw i64 %step.add, %c2
   br label %loop
 }
@@ -398,8 +396,6 @@ define void @add_both_nsw_first_nuw(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nuw nsw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -409,7 +405,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nuw nsw i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = add nsw i64 %step.add, %c2
   br label %loop
 }
@@ -421,8 +416,6 @@ define void @add_both_nsw_second_nuw(i64 %c1, i64 %c2) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = add nsw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = add i64 [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -432,33 +425,32 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add nsw i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = add nuw nsw i64 %step.add, %c2
   br label %loop
 }
 
 ;
 ; Hoist MUL and drop NSW even if both ops have it.
-define void @mul_no_nsw_2(i64 %c1, i64 %c2) {
+define void @mul_no_nsw_2(<2 x i64> %c1, <2 x i64> %c2) {
 ; CHECK-LABEL: @mul_no_nsw_2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul i64 [[C1:%.*]], [[C2:%.*]]
+; CHECK-NEXT:    [[INVARIANT_OP:%.*]] = mul <2 x i64> [[C1:%.*]], [[C2:%.*]]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nsw i64 [[INDEX]], [[C1]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
-; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul i64 [[INDEX]], [[INVARIANT_OP]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_REASS:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = mul nsw <2 x i64> [[INDEX]], [[C1]]
+; CHECK-NEXT:    call void @use(<2 x i64> [[STEP_ADD]])
+; CHECK-NEXT:    [[INDEX_NEXT_REASS]] = mul <2 x i64> [[INDEX]], [[INVARIANT_OP]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:
   br label %loop
 
 loop:
-  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
-  %step.add = mul nsw i64 %index, %c1
-  call void @use(i64 %step.add)
-  %index.next = mul nsw i64 %step.add, %c2
+  %index = phi <2 x i64> [ zeroinitializer, %entry ], [ %index.next, %loop ]
+  %step.add = mul nsw <2 x i64> %index, %c1
+  call void @use(<2 x i64> %step.add)
+  %index.next = mul nsw <2 x i64> %step.add, %c2
   br label %loop
 }
 
@@ -470,7 +462,6 @@ define void @diff_ops(i64 %c1, i64 %c2) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add i64 [[INDEX]], [[C1:%.*]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = mul i64 [[STEP_ADD]], [[C2:%.*]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -480,7 +471,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = add i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = mul i64 %step.add, %c2
   br label %loop
 }
@@ -493,7 +483,6 @@ define void @noassoc_ops(i64 %c1, i64 %c2) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = sub i64 [[INDEX]], [[C1:%.*]]
-; CHECK-NEXT:    call void @use(i64 [[STEP_ADD]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = sub i64 [[STEP_ADD]], [[C2:%.*]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -503,7 +492,6 @@ entry:
 loop:
   %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
   %step.add = sub i64 %index, %c1
-  call void @use(i64 %step.add)
   %index.next = sub i64 %step.add, %c2
   br label %loop
 }
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index d1cf3de..59dea58 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -97,7 +97,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
 ; CHECK-NEXT:    [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
-; CHECK-NEXT:    [[ADD_REASS]] = add i32 [[I_ADDR]], 2
+; CHECK-NEXT:    [[ADD_REASS]] = add nsw i32 [[ADD_I]], 1
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
 ; CHECK:       loopexit0:
 ; CHECK-NEXT:    [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]
diff --git a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
index e303d04..8f90453 100644
--- a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
+++ b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
@@ -4,7 +4,7 @@
 define i16 @main() {
 ; SCEV-EXPR-LABEL: 'main'
 ; SCEV-EXPR-NEXT:  Classifying expressions for: @main
-; SCEV-EXPR-NEXT:    %mul = phi i16 [ 1, %entry ], [ %mul.n.3.reass, %loop ]
+; SCEV-EXPR-NEXT:    %mul = phi i16 [ 1, %entry ], [ %mul.n.3, %loop ]
 ; SCEV-EXPR-NEXT:    --> %mul U: [0,-15) S: [-32768,32753) Exits: 4096 LoopDispositions: { %loop: Variant }
 ; SCEV-EXPR-NEXT:    %div = phi i16 [ 32767, %entry ], [ %div.n.3, %loop ]
 ; SCEV-EXPR-NEXT:    --> %div U: [-2048,-32768) S: [-2048,-32768) Exits: 7 LoopDispositions: { %loop: Variant }
@@ -16,7 +16,7 @@ define i16 @main() {
 ; SCEV-EXPR-NEXT:    --> %div.n.1 U: [-8192,8192) S: [-8192,8192) Exits: 1 LoopDispositions: { %loop: Variant }
 ; SCEV-EXPR-NEXT:    %div.n.2 = sdiv i16 %div.n.1, 2
 ; SCEV-EXPR-NEXT:    --> %div.n.2 U: [-4096,4096) S: [-4096,4096) Exits: 0 LoopDispositions: { %loop: Variant }
-; SCEV-EXPR-NEXT:    %mul.n.3.reass = mul i16 %mul, 16
+; SCEV-EXPR-NEXT:    %mul.n.3 = mul i16 %mul.n.reass.reass, 2
 ; SCEV-EXPR-NEXT:    --> (16 * %mul) U: [0,-15) S: [-32768,32753) Exits: 0 LoopDispositions: { %loop: Variant }
 ; SCEV-EXPR-NEXT:    %div.n.3 = sdiv i16 %div.n.2, 2
 ; SCEV-EXPR-NEXT:    --> %div.n.3 U: [-2048,2048) S: [-2048,2048) Exits: 0 LoopDispositions: { %loop: Variant }
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
index 901f228..ee6da8f 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -1,87 +1,91 @@
-; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s | FileCheck %s --check-prefix=CHECKUF1
-; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-interleave=2 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s | FileCheck %s --check-prefix=CHECKUF2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 5
+; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s | FileCheck %s --check-prefix=CHECKUF2
 
-; CHECKUF1: for.body.preheader:
-; CHECKUF1-DAG: %wide.trip.count = zext nneg i32 %N to i64
-; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl nuw i64 %[[VSCALE]], 2
-; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
-
-; CHECKUF1: vector.ph:
-; CHECKUF1-DAG:  %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1-DAG:  %[[VSCALEX4:.*]] = shl nuw i64 %[[VSCALE]], 2
-; CHECKUF1-DAG:  %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
-; CHECKUF1:      %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
-
-; CHECKUF1: vector.body:
-; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, ptr %b, i64 %index
-; CHECKUF1: %wide.load = load <vscale x 4 x double>, ptr %[[IDXB]], align 8
-; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, splat (double 1.000000e+00)
-; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, ptr %a, i64 %index
-; CHECKUF1: store <vscale x 4 x double> %[[FADD]], ptr %[[IDXA]], align 8
-; CHECKUF1: %index.next = add nuw i64 %index, %[[VSCALEX4]]
-; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
-
-
-; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
+; For an interleave factor of 2, vscale is scaled by 8 instead of 4.
 ; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
-
-; CHECKUF2: for.body.preheader:
-; CHECKUF2-DAG: %wide.trip.count = zext nneg i32 %N to i64
-; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl nuw i64 %[[VSCALE]], 3
-; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
-
-; CHECKUF2: vector.ph:
-; CHECKUF2-DAG:  %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2-DAG:  %[[VSCALEX8:.*]] = shl nuw i64 %[[VSCALE]], 3
-; CHECKUF2-DAG:  %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
-; CHECKUF2:      %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
-
-; CHECKUF2: vector.body:
-; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, ptr %b, i64 %index
-; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 5
-; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds i8, ptr %[[IDXB]], i64 %[[VSCALE2]]
-; CHECKUF2: %wide.load = load <vscale x 4 x double>, ptr %[[IDXB]], align 8
-; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, ptr %[[IDXB_NEXT]], align 8
-; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, splat (double 1.000000e+00)
-; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, splat (double 1.000000e+00)
-; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, ptr %a, i64 %index
-; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 5
-; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds i8, ptr %[[IDXA]], i64 %[[VSCALE2]]
-; CHECKUF2: store <vscale x 4 x double> %[[FADD]], ptr %[[IDXA]], align 8
-; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], ptr %[[IDXA_NEXT]], align 8
-; CHECKUF2: %index.next = add nuw i64 %index, %[[VSCALEX8]]
-; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
-
-define void @loop(i32 %N, ptr nocapture %a, ptr nocapture readonly %b) {
+define void @loop(i64 %N, ptr noalias %a, ptr noalias %b) {
+; CHECKUF1-LABEL: define void @loop(
+; CHECKUF1-SAME: i64 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECKUF1-NEXT:  [[ENTRY:.*:]]
+; CHECKUF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECKUF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECKUF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECKUF1:       [[VECTOR_PH]]:
+; CHECKUF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
+; CHECKUF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; CHECKUF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECKUF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECKUF1:       [[VECTOR_BODY]]:
+; CHECKUF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECKUF1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]]
+; CHECKUF1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP7]], align 8
+; CHECKUF1-NEXT:    [[TMP8:%.*]] = fadd <vscale x 4 x double> [[WIDE_LOAD]], splat (double 1.000000e+00)
+; CHECKUF1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
+; CHECKUF1-NEXT:    store <vscale x 4 x double> [[TMP8]], ptr [[TMP9]], align 8
+; CHECKUF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECKUF1-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECKUF1-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECKUF1:       [[MIDDLE_BLOCK]]:
+; CHECKUF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECKUF1-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECKUF1:       [[SCALAR_PH]]:
+;
+; CHECKUF2-LABEL: define void @loop(
+; CHECKUF2-SAME: i64 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECKUF2-NEXT:  [[ENTRY:.*:]]
+; CHECKUF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECKUF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECKUF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECKUF2:       [[VECTOR_PH]]:
+; CHECKUF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-NEXT:    [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
+; CHECKUF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; CHECKUF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECKUF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECKUF2:       [[VECTOR_BODY]]:
+; CHECKUF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECKUF2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]]
+; CHECKUF2-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-NEXT:    [[TMP16:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECKUF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP7]], i64 [[TMP16]]
+; CHECKUF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP7]], align 8
+; CHECKUF2-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x double>, ptr [[TMP9]], align 8
+; CHECKUF2-NEXT:    [[TMP10:%.*]] = fadd <vscale x 4 x double> [[WIDE_LOAD]], splat (double 1.000000e+00)
+; CHECKUF2-NEXT:    [[TMP11:%.*]] = fadd <vscale x 4 x double> [[WIDE_LOAD3]], splat (double 1.000000e+00)
+; CHECKUF2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
+; CHECKUF2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-NEXT:    [[TMP17:%.*]] = mul nuw i64 [[TMP13]], 4
+; CHECKUF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP12]], i64 [[TMP17]]
+; CHECKUF2-NEXT:    store <vscale x 4 x double> [[TMP10]], ptr [[TMP12]], align 8
+; CHECKUF2-NEXT:    store <vscale x 4 x double> [[TMP11]], ptr [[TMP14]], align 8
+; CHECKUF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECKUF2-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECKUF2-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECKUF2:       [[MIDDLE_BLOCK]]:
+; CHECKUF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECKUF2-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECKUF2:       [[SCALAR_PH]]:
+;
 entry:
-  %cmp7 = icmp sgt i32 %N, 0
-  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
+  br label %loop
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds double, ptr %b, i64 %indvars.iv
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds double, ptr %b, i64 %iv
   %0 = load double, ptr %arrayidx, align 8
   %add = fadd double %0, 1.000000e+00
-  %arrayidx2 = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds double, ptr %a, i64 %iv
   store double %add, ptr %arrayidx2, align 8
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop, !llvm.loop !1
+
+exit:
+  ret void
 }
 
 !1 = distinct !{!1, !2}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll
new file mode 100644
index 0000000..fa25af5
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O3 -S %s | FileCheck %s
+
+target triple = "arm64-apple-macosx15.0.0"
+
+define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 noundef signext %s) nofree nosync {
+; CHECK-LABEL: define i64 @std_find_i16_constant_offset_with_assumptions(
+; CHECK-SAME: ptr [[FIRST_COERCE:%.*]], i16 noundef signext [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST_COERCE]], i64 2) ]
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST_COERCE]], i64 256) ]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[FIRST_COERCE]] to i64
+; CHECK-NEXT:    [[COERCE_VAL_PI_I:%.*]] = add i64 [[TMP0]], 256
+; CHECK-NEXT:    [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[COERCE_VAL_PI_I]] to ptr
+; CHECK-NEXT:    [[CMP_NOT6_I_I:%.*]] = icmp eq ptr [[FIRST_COERCE]], [[COERCE_VAL_IP]]
+; CHECK-NEXT:    br i1 [[CMP_NOT6_I_I]], label %[[RETURN:.*]], label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[FIRST_COERCE]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[PTR_IV]], align 2
+; CHECK-NEXT:    [[CMP2_I_I:%.*]] = icmp eq i16 [[TMP1]], [[S]]
+; CHECK-NEXT:    br i1 [[CMP2_I_I]], label %[[RETURN_LOOPEXIT:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 2
+; CHECK-NEXT:    [[CMP_NOT_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[COERCE_VAL_IP]]
+; CHECK-NEXT:    br i1 [[CMP_NOT_I_I]], label %[[RETURN_LOOPEXIT]], label %[[LOOP_HEADER]]
+; CHECK:       [[RETURN_LOOPEXIT]]:
+; CHECK-NEXT:    [[MERGE_PH:%.*]] = phi ptr [ [[COERCE_VAL_IP]], %[[LOOP_LATCH]] ], [ [[PTR_IV]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = ptrtoint ptr [[MERGE_PH]] to i64
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[RES_PRE_PHI:%.*]] = phi i64 [ [[DOTPRE]], %[[RETURN_LOOPEXIT]] ], [ [[TMP0]], %[[ENTRY]] ]
+; CHECK-NEXT:    ret i64 [[RES_PRE_PHI]]
+;
+entry:
+  %first = alloca { ptr }, align 8
+  %s.addr = alloca i16, align 2
+  store ptr %first.coerce, ptr %first, align 8
+  store i16 %s, ptr %s.addr, align 2
+  %0 = load ptr, ptr %first, align 8
+  call void @llvm.assume(i1 true) [ "align"(ptr %0, i64 2) ]
+  call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %0, i64 256) ]
+  %start.ptr = load ptr, ptr %first, align 8
+  %1 = load i64, ptr %first, align 8
+  %coerce.val.pi.i = add i64 %1, 256
+  %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr
+  %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip
+  br i1 %cmp.not6.i.i, label %return, label %loop.ph
+
+loop.ph:
+  %2 = load i16, ptr %s.addr, align 2
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ]
+  %3 = load i16, ptr %ptr.iv, align 2
+  %cmp2.i.i = icmp eq i16 %3, %2
+  br i1 %cmp2.i.i, label %return, label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2
+  %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip
+  br i1 %cmp.not.i.i, label %return, label %loop.header
+
+return:
+  %merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ]
+  %res = ptrtoint ptr %merge to i64
+  ret i64 %res
+}
+
+define i64 @std_find_i16_constant_offset_no_assumptions(ptr %first.coerce, i16 noundef signext %s) nofree nosync {
+; CHECK-LABEL: define i64 @std_find_i16_constant_offset_no_assumptions(
+; CHECK-SAME: ptr [[FIRST_COERCE:%.*]], i16 noundef signext [[S:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[FIRST_COERCE]] to i64
+; CHECK-NEXT:    [[COERCE_VAL_PI_I:%.*]] = add i64 [[TMP0]], 256
+; CHECK-NEXT:    [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[COERCE_VAL_PI_I]] to ptr
+; CHECK-NEXT:    [[CMP_NOT6_I_I:%.*]] = icmp eq ptr [[FIRST_COERCE]], [[COERCE_VAL_IP]]
+; CHECK-NEXT:    br i1 [[CMP_NOT6_I_I]], label %[[RETURN:.*]], label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[FIRST_COERCE]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[PTR_IV]], align 2
+; CHECK-NEXT:    [[CMP2_I_I:%.*]] = icmp eq i16 [[TMP1]], [[S]]
+; CHECK-NEXT:    br i1 [[CMP2_I_I]], label %[[RETURN_LOOPEXIT:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 2
+; CHECK-NEXT:    [[CMP_NOT_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[COERCE_VAL_IP]]
+; CHECK-NEXT:    br i1 [[CMP_NOT_I_I]], label %[[RETURN_LOOPEXIT]], label %[[LOOP_HEADER]]
+; CHECK:       [[RETURN_LOOPEXIT]]:
+; CHECK-NEXT:    [[MERGE_PH:%.*]] = phi ptr [ [[COERCE_VAL_IP]], %[[LOOP_LATCH]] ], [ [[PTR_IV]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = ptrtoint ptr [[MERGE_PH]] to i64
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[RES_PRE_PHI:%.*]] = phi i64 [ [[DOTPRE]], %[[RETURN_LOOPEXIT]] ], [ [[TMP0]], %[[ENTRY]] ]
+; CHECK-NEXT:    ret i64 [[RES_PRE_PHI]]
+;
+entry:
+  %first = alloca { ptr }, align 8
+  %s.addr = alloca i16, align 2
+  store ptr %first.coerce, ptr %first, align 8
+  store i16 %s, ptr %s.addr, align 2
+  %0 = load ptr, ptr %first, align 8
+  %start.ptr = load ptr, ptr %first, align 8
+  %1 = load i64, ptr %first, align 8
+  %coerce.val.pi.i = add i64 %1, 256
+  %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr
+  %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip
+  br i1 %cmp.not6.i.i, label %return, label %loop.ph
+
+loop.ph:
+  %2 = load i16, ptr %s.addr, align 2
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ]
+  %3 = load i16, ptr %ptr.iv, align 2
+  %cmp2.i.i = icmp eq i16 %3, %2
+  br i1 %cmp2.i.i, label %return, label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2
+  %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip
+  br i1 %cmp.not.i.i, label %return, label %loop.header
+
+return:
+  %merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ]
+  %res = ptrtoint ptr %merge to i64
+  ret i64 %res
+}
+
+declare void @llvm.assume(i1 noundef)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependent-schedule.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependent-schedule.ll
new file mode 100644
index 0000000..8602c25
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependent-schedule.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(i32 %0, i32 %1) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[K:%.*]] = alloca [4 x i32], align 16
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[SUB2:%.*]] = add i32 [[ADD1]], -1
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (ptr, ...) @printf(ptr null, i32 [[ADD1]])
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[TMP1]], -1
+; CHECK-NEXT:    [[SUB3:%.*]] = add i32 [[ADD2]], [[CALL]]
+; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[SUB3]], [[TMP0]]
+; CHECK-NEXT:    store i32 [[ADD4]], ptr [[K]], align 16
+; CHECK-NEXT:    [[ARRAYINIT_ELEMENT:%.*]] = getelementptr i8, ptr [[K]], i64 4
+; CHECK-NEXT:    store i32 0, ptr [[ARRAYINIT_ELEMENT]], align 4
+; CHECK-NEXT:    [[ARRAYINIT_ELEMENT5:%.*]] = getelementptr i8, ptr [[K]], i64 8
+; CHECK-NEXT:    [[ADD7:%.*]] = add i32 [[ADD2]], [[SUB2]]
+; CHECK-NEXT:    [[SUB8:%.*]] = add i32 [[ADD7]], [[TMP0]]
+; CHECK-NEXT:    store i32 [[SUB8]], ptr [[ARRAYINIT_ELEMENT5]], align 8
+; CHECK-NEXT:    [[ARRAYINIT_ELEMENT9:%.*]] = getelementptr i8, ptr [[K]], i64 12
+; CHECK-NEXT:    [[ADD13:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[ADD10:%.*]] = add i32 [[ADD13]], [[TMP0]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[ADD10]], [[ADD1]]
+; CHECK-NEXT:    [[ADD12:%.*]] = add i32 [[ADD11]], [[TMP0]]
+; CHECK-NEXT:    store i32 [[ADD12]], ptr [[ARRAYINIT_ELEMENT9]], align 4
+; CHECK-NEXT:    [[CALL15:%.*]] = call i32 (ptr, ...) @printf(ptr null, ptr [[K]])
+; CHECK-NEXT:    ret i32 [[CALL15]]
+;
+entry:
+  %k = alloca [4 x i32], align 16
+  %add1 = add i32 %0, %1
+  %sub2 = add i32 %add1, -1
+  %call = tail call i32 (ptr, ...) @printf(ptr null, i32 %add1)
+  %add2 = add i32 %1, -1
+  %sub3 = add i32 %add2, %call
+  %add4 = add i32 %sub3, %0
+  store i32 %add4, ptr %k, align 16
+  %arrayinit.element = getelementptr i8, ptr %k, i64 4
+  store i32 0, ptr %arrayinit.element, align 4
+  %arrayinit.element5 = getelementptr i8, ptr %k, i64 8
+  %add7 = add i32 %add2, %sub2
+  %sub8 = add i32 %add7, %0
+  store i32 %sub8, ptr %arrayinit.element5, align 8
+  %arrayinit.element9 = getelementptr i8, ptr %k, i64 12
+  %add13 = add i32 %1, 1
+  %add10 = add i32 %add13, %0
+  %add11 = add i32 %add10, %add1
+  %add12 = add i32 %add11, %0
+  store i32 %add12, ptr %arrayinit.element9, align 4
+  %call15 = call i32 (ptr, ...) @printf(ptr null, ptr %k)
+  ret i32 %call15
+}
+
+declare i32 @printf(ptr, ...)
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
new file mode 100644
index 0000000..056f33e
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -0,0 +1,435 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; Test that an xor with a constant operand is decomposed into a GEP.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
+; RUN: -S < %s | FileCheck %s
+; Test that the GVN pass eliminates the redundant xor instructions left by the decomposition.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \
+; RUN: -S < %s | FileCheck --check-prefix=GVN %s
+
+; Check that disjoint constants are properly extracted and folded into GEP
+; addressing modes, and that GVN eliminates the redundant computations.
+define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test1(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576
+; CHECK-NEXT:    [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test1(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT:    [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT:    [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
+; GVN-NEXT:    [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
+; GVN-NEXT:    store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = xor i32 %2, 4128
+  %5 = xor i32 %2, 8224
+  %6 = xor i32 %2, 12320
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %8 = getelementptr half, ptr addrspace(3) %1, i32 %4
+  %9 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %10 = getelementptr half, ptr addrspace(3) %1, i32 %6
+  %11 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %12 = load <8 x half>, ptr addrspace(3) %8, align 16
+  %13 = load <8 x half>, ptr addrspace(3) %9, align 16
+  %14 = load <8 x half>, ptr addrspace(3) %10, align 16
+  %15 = fadd <8 x half> %11, %12
+  %16 = fadd <8 x half> %13, %14
+  %17 = fadd <8 x half> %15, %16
+  store <8 x half> %17, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+; Check that disjoint constants are properly extracted and folded into GEP
+; addressing modes, and that GVN eliminates the redundant computations.
+define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test2(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 24576
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 16384
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16
+; CHECK-NEXT:    [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test2(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT:    [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT:    [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
+; GVN-NEXT:    [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
+; GVN-NEXT:    store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 12320
+  %4 = xor i32 %2, 8224
+  %5 = xor i32 %2, 4128
+  %6 = xor i32 %2, 32
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %8 = getelementptr half, ptr addrspace(3) %1, i32 %4
+  %9 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %10 = getelementptr half, ptr addrspace(3) %1, i32 %6
+  %11 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %12 = load <8 x half>, ptr addrspace(3) %8, align 16
+  %13 = load <8 x half>, ptr addrspace(3) %9, align 16
+  %14 = load <8 x half>, ptr addrspace(3) %10, align 16
+  %15 = fadd <8 x half> %11, %12
+  %16 = fadd <8 x half> %13, %14
+  %17 = fadd <8 x half> %15, %16
+  store <8 x half> %17, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+; Verify that xor instructions with different non-disjoint constants are optimized
+define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test3(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 288
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    store <8 x half> [[TMP15]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test3(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 288
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
+; GVN-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; GVN-NEXT:    [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; GVN-NEXT:    [[TMP13:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
+; GVN-NEXT:    store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = xor i32 %2, 2336
+  %5 = xor i32 %2, 4128
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %4
+  %8 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %9 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %10 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %11 = load <8 x half>, ptr addrspace(3) %8, align 16
+  %12 = fadd <8 x half> %9, %10
+  %13 = fadd <8 x half> %11, %12
+  store <8 x half> %13, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+; Verify that no optimization occurs when disjoint constants are absent
+define amdgpu_kernel void @test4(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test4(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 288
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test4(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 288
+; GVN-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; GVN-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; GVN-NEXT:    store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = xor i32 %2, 288
+  %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
+  %7 = load <8 x half>, ptr addrspace(3) %5, align 16
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = fadd <8 x half> %7, %8
+  store <8 x half> %9, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+
+; Verify that XOR-BinOp-GEP usage chains are properly optimized
+define amdgpu_kernel void @test5(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test5(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 256
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test5(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP5:%.*]] = add i32 [[TMP3]], 256
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = xor i32 %2, 4128
+  %5 = add i32 %4, 256
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %10 = fadd <8 x half> %8, %9
+  store <8 x half> %10, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+; Verify that BinOp-XOR-GEP usage chains are properly optimized.
+; In the test below, make sure we stop processing the chain at the xor
+; and do not fold the constant from the add instruction into the gep.
+; The constant from the add could also be folded; future work will
+; cover these cases.
+define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test6(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], 32
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test6(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; GVN-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], 32
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; GVN-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; GVN-NEXT:    [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; GVN-NEXT:    store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = add i32 %2, 256
+  %5 = xor i32 %4, 4128
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %10 = fadd <8 x half> %8, %9
+  store <8 x half> %10, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+; Verify that BinOp-XOR-GEP usage chains with a non-disjoint xor are left
+; unchanged.
+define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test6a(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 288
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test6a(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; GVN-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 288
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = add i32 %2, 256
+  %5 = xor i32 %4, 288
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %10 = fadd <8 x half> %8, %9
+  store <8 x half> %10, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are
+; not extracted
+define amdgpu_kernel void @test7(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test7(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test7(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
+; GVN-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; GVN-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; GVN-NEXT:    store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = xor i32 %2, 32800
+  %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
+  %7 = load <8 x half>, ptr addrspace(3) %5, align 16
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = fadd <8 x half> %7, %8
+  store <8 x half> %9, ptr addrspace(3) %1, align 16
+  ret void
+}
+
diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c
index e73f697..3fddd02 100644
--- a/llvm/tools/llvm-c-test/debuginfo.c
+++ b/llvm/tools/llvm-c-test/debuginfo.c
@@ -416,3 +416,46 @@ int llvm_di_type_get_name(void) {
 
   return 0;
 }
+
+// Checks that a global variable can carry more than one !dbg attachment:
+// the same DIGlobalVariableExpression is attached once through
+// LLVMGlobalAddDebugInfo and once through LLVMGlobalAddMetadata, and
+// LLVMGlobalCopyAllMetadata must then report two entries. Returns 0.
+int llvm_add_globaldebuginfo(void) {
+  const char *Filename = "debuginfo.c";
+  LLVMModuleRef M = LLVMModuleCreateWithName(Filename);
+  LLVMDIBuilderRef Builder = LLVMCreateDIBuilder(M);
+  LLVMMetadataRef File =
+      LLVMDIBuilderCreateFile(Builder, Filename, strlen(Filename), ".", 1);
+
+  LLVMMetadataRef GlobalVarValueExpr =
+      LLVMDIBuilderCreateConstantValueExpression(Builder, 0);
+  LLVMMetadataRef Int64Ty =
+      LLVMDIBuilderCreateBasicType(Builder, "Int64", 5, 64, 0, LLVMDIFlagZero);
+  LLVMMetadataRef Int64TypeDef = LLVMDIBuilderCreateTypedef(
+      Builder, Int64Ty, "int64_t", 7, File, 42, File, 0);
+
+  LLVMMetadataRef GVE = LLVMDIBuilderCreateGlobalVariableExpression(
+      Builder, File, "global", 6, "", 0, File, 1, Int64TypeDef, true,
+      GlobalVarValueExpr, NULL, 0);
+
+  LLVMTypeRef RecType =
+      LLVMStructCreateNamed(LLVMGetModuleContext(M), "struct");
+  LLVMValueRef Global = LLVMAddGlobal(M, RecType, "global");
+
+  LLVMGlobalAddDebugInfo(Global, GVE);
+  // Attach the same expression again through the generic metadata API.
+  unsigned KindId = LLVMGetMDKindID("dbg", 3);
+  LLVMGlobalAddMetadata(Global, KindId, GVE);
+  size_t NumEntries = 0;
+  LLVMValueMetadataEntry *ME = LLVMGlobalCopyAllMetadata(Global, &NumEntries);
+  assert(ME != NULL);
+  assert(NumEntries == 2);
+
+  // The array returned by LLVMGlobalCopyAllMetadata is owned by the caller.
+  LLVMDisposeValueMetadataEntries(ME);
+  LLVMDisposeDIBuilder(Builder);
+  LLVMDisposeModule(M);
+
+  return 0;
+}
diff --git a/llvm/tools/llvm-c-test/llvm-c-test.h b/llvm/tools/llvm-c-test/llvm-c-test.h
index 1da6596c..4c5a88c 100644
--- a/llvm/tools/llvm-c-test/llvm-c-test.h
+++ b/llvm/tools/llvm-c-test/llvm-c-test.h
@@ -45,6 +45,7 @@ int llvm_add_named_metadata_operand(void);
 int llvm_set_metadata(void);
 int llvm_replace_md_operand(void);
 int llvm_is_a_value_as_metadata(void);
+int llvm_add_globaldebuginfo(void);
 
 // object.c
 int llvm_object_list_sections(void);
diff --git a/llvm/tools/llvm-c-test/main.c b/llvm/tools/llvm-c-test/main.c
index badbe4b..d1963b7 100644
--- a/llvm/tools/llvm-c-test/main.c
+++ b/llvm/tools/llvm-c-test/main.c
@@ -101,6 +101,8 @@ int main(int argc, char **argv) {
     return llvm_replace_md_operand();
   } else if (argc == 2 && !strcmp(argv[1], "--is-a-value-as-metadata")) {
     return llvm_is_a_value_as_metadata();
+  } else if (argc == 2 && !strcmp(argv[1], "--add-globaldebuginfo")) {
+    return llvm_add_globaldebuginfo();
   } else if (argc == 2 && !strcmp(argv[1], "--test-function-attributes")) {
     return llvm_test_function_attributes();
   } else if (argc == 2 && !strcmp(argv[1], "--test-callsite-attributes")) {
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 2c9d6dc..1fd0a15 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -598,6 +598,7 @@ struct InstructionInfo {
   std::string HexBytes;
 };
 
+#ifndef NDEBUG
 // Helper function to print generated assembly snippets
 void printInstructions(const std::vector<InstructionInfo> &Instructions,
                        int InitialLinesCount, int LastLinesCount) {
@@ -622,6 +623,7 @@ void printInstructions(const std::vector<InstructionInfo> &Instructions,
            << Instructions[i].HexBytes << Instructions[i].Text << '\n';
   dbgs() << "```\n";
 }
+#endif // NDEBUG
 
 // Function to extract and print assembly from snippet
 Error printAssembledSnippet(const LLVMState &State,
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index 4741c7b..acc6a09 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -3103,6 +3103,53 @@ TEST(APIntOpsTest, Mulh) {
   EXPECT_EQ(APInt(128, "FFEB498812C66C68D4552DB89B8EBF8F", 16), i128Res);
 }
 
+TEST(APIntOpsTest, muli) {
+  APInt u32a(32, 0x0001'E235);
+  APInt u32b(32, 0xF623'55AD);
+  EXPECT_EQ(0x0001'CFA1'7CA0'76D1, APIntOps::muluExtended(u32a, u32b));
+
+  APInt u64a(64, 0x1234'5678'90AB'CDEF);
+  APInt u64b(64, 0xFEDC'BA09'8765'4321);
+  EXPECT_EQ(APInt(128, "121FA000A3723A57C24A442FE55618CF", 16),
+            APIntOps::muluExtended(u64a, u64b));
+
+  APInt u128a(128, "1234567890ABCDEF1234567890ABCDEF", 16);
+  APInt u128b(128, "FEDCBA0987654321FEDCBA0987654321", 16);
+  EXPECT_EQ(
+      APInt(256,
+            "121FA000A3723A57E68984312C3A8D7E96B428606E1E6BF5C24A442FE55618CF",
+            16),
+      APIntOps::muluExtended(u128a, u128b));
+
+  APInt s32a(32, 0x1234'5678);
+  APInt s32b(32, 0x10AB'CDEF);
+  APInt s32c(32, 0xFEDC'BA09);
+  EXPECT_EQ(0x012F'7D02'2A42'D208, APIntOps::mulsExtended(s32a, s32b));
+  EXPECT_EQ(0xFFEB'4988'09CA'3A38, APIntOps::mulsExtended(s32a, s32c));
+
+  APInt s64a(64, 0x1234'5678'90AB'CDEF);
+  APInt s64b(64, 0x1234'5678'90FE'DCBA);
+  APInt s64c(64, 0xFEDC'BA09'8765'4321);
+  EXPECT_EQ(APInt(128, "014B66DC328E10C1FB99704184EF03A6", 16),
+            APIntOps::mulsExtended(s64a, s64b));
+  EXPECT_EQ(APInt(128, "FFEB498812C66C68C24A442FE55618CF", 16),
+            APIntOps::mulsExtended(s64a, s64c));
+
+  APInt s128a(128, "1234567890ABCDEF1234567890ABCDEF", 16);
+  APInt s128b(128, "1234567890FEDCBA1234567890FEDCBA", 16);
+  APInt s128c(128, "FEDCBA0987654321FEDCBA0987654321", 16);
+  EXPECT_EQ(
+      APInt(256,
+            "014B66DC328E10C1FE303DF9EA0B2529F87E475F3C6C180DFB99704184EF03A6",
+            16),
+      APIntOps::mulsExtended(s128a, s128b));
+  EXPECT_EQ(
+      APInt(256,
+            "FFEB498812C66C68D4552DB89B8EBF8F96B428606E1E6BF5C24A442FE55618CF",
+            16),
+      APIntOps::mulsExtended(s128a, s128c));
+}
+
 TEST(APIntTest, RoundingUDiv) {
   for (uint64_t Ai = 1; Ai <= 255; Ai++) {
     APInt A(8, Ai);
diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp
index bdfbc85..fdecfb7 100644
--- a/llvm/unittests/ADT/DenseMapTest.cpp
+++ b/llvm/unittests/ADT/DenseMapTest.cpp
@@ -10,6 +10,7 @@
 #include "CountCopyAndMove.h"
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/DenseMapInfoVariant.h"
+#include "llvm/ADT/STLForwardCompat.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "gmock/gmock.h"
@@ -249,6 +250,25 @@ TYPED_TEST(DenseMapTest, CopyConstructorNotSmallTest) {
     EXPECT_EQ(this->getValue(Key), copyMap[this->getKey(Key)]);
 }
 
+// Test the from_range and initializer-list constructors.
+TYPED_TEST(DenseMapTest, RangeConstructorTest) {
+  using KeyAndValue =
+      std::pair<typename TypeParam::key_type, typename TypeParam::mapped_type>;
+  KeyAndValue PlainArray[] = {{this->getKey(0), this->getValue(0)},
+                              {this->getKey(1), this->getValue(1)}};
+
+  TypeParam MapFromRange(llvm::from_range, PlainArray);
+  EXPECT_EQ(2u, MapFromRange.size());
+  EXPECT_EQ(this->getValue(0), MapFromRange[this->getKey(0)]);
+  EXPECT_EQ(this->getValue(1), MapFromRange[this->getKey(1)]);
+
+  TypeParam MapFromInitList({{this->getKey(0), this->getValue(1)},
+                             {this->getKey(1), this->getValue(2)}});
+  EXPECT_EQ(2u, MapFromInitList.size());
+  EXPECT_EQ(this->getValue(1), MapFromInitList[this->getKey(0)]);
+  EXPECT_EQ(this->getValue(2), MapFromInitList[this->getKey(1)]);
+}
+
 // Test copying from a default-constructed map.
 TYPED_TEST(DenseMapTest, CopyConstructorFromDefaultTest) {
   TypeParam copyMap(this->Map);
@@ -726,6 +746,15 @@ TEST(DenseMapCustomTest, FindAsTest) {
   EXPECT_TRUE(map.find_as("d") == map.end());
 }
 
+TEST(DenseMapCustomTest, SmallDenseMapFromRange) {
+  std::pair<int, StringRef> PlainArray[] = {{0, "0"}, {1, "1"}, {2, "2"}};
+  SmallDenseMap<int, StringRef> M(llvm::from_range, PlainArray);
+  EXPECT_EQ(3u, M.size());
+  using testing::Pair;
+  EXPECT_THAT(M, testing::UnorderedElementsAre(Pair(0, "0"), Pair(1, "1"),
+                                               Pair(2, "2")));
+}
+
 TEST(DenseMapCustomTest, SmallDenseMapInitializerList) {
   SmallDenseMap<int, int> M = {{0, 0}, {0, 1}, {1, 2}};
   EXPECT_EQ(2u, M.size());
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
index c036d31b..b3c4087 100644
--- a/llvm/unittests/CAS/ObjectStoreTest.cpp
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CAS/ObjectStore.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/RandomNumberGenerator.h"
 #include "llvm/Support/ThreadPool.h"
@@ -268,12 +269,13 @@ TEST_P(CASTest, NodesBig) {
     ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded());
 }
 
+#if LLVM_ENABLE_THREADS
 /// Common test functionality for creating blobs in parallel. You can vary which
 /// cas instances are the same or different, and the size of the created blobs.
 static void testBlobsParallel(ObjectStore &Read1, ObjectStore &Read2,
                               ObjectStore &Write1, ObjectStore &Write2,
                               uint64_t BlobSize) {
-  SCOPED_TRACE(testBlobsParallel);
+  SCOPED_TRACE("testBlobsParallel");
   unsigned BlobCount = 100;
   std::vector<std::string> Blobs;
   Blobs.reserve(BlobCount);
@@ -325,7 +327,7 @@ static void testBlobsParallel(ObjectStore &Read1, ObjectStore &Read2,
 }
 
 static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) {
-  SCOPED_TRACE(testBlobsParallel1);
+  SCOPED_TRACE("testBlobsParallel1");
   testBlobsParallel(CAS, CAS, CAS, CAS, BlobSize);
 }
 
@@ -342,4 +344,5 @@ TEST_P(CASTest, BlobsBigParallel) {
   uint64_t Size = 100ULL * 1024;
   ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size));
 }
-#endif
+#endif // EXPENSIVE_CHECKS
+#endif // LLVM_ENABLE_THREADS
diff --git a/llvm/unittests/ExecutionEngine/Orc/ThreadSafeModuleTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ThreadSafeModuleTest.cpp
index bbb9e8d..7db561c 100644
--- a/llvm/unittests/ExecutionEngine/Orc/ThreadSafeModuleTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/ThreadSafeModuleTest.cpp
@@ -31,15 +31,21 @@ const llvm::StringRef FooSrc = R"(
   }
 )";
 
-static ThreadSafeModule parseModule(llvm::StringRef Source,
-                                    llvm::StringRef Name) {
-  auto Ctx = std::make_unique<LLVMContext>();
+static std::unique_ptr<Module>
+parseModuleRaw(llvm::StringRef Source, llvm::StringRef Name, LLVMContext &Ctx) {
   SMDiagnostic Err;
-  auto M = parseIR(MemoryBufferRef(Source, Name), Err, *Ctx);
+  auto M = parseIR(MemoryBufferRef(Source, Name), Err, Ctx);
   if (!M) {
     Err.print("Testcase source failed to parse: ", errs());
     exit(1);
   }
+  return M;
+}
+
+static ThreadSafeModule parseModule(llvm::StringRef Source,
+                                    llvm::StringRef Name) {
+  auto Ctx = std::make_unique<LLVMContext>();
+  auto M = parseModuleRaw(Source, Name, *Ctx);
   return ThreadSafeModule(std::move(M), std::move(Ctx));
 }
 
@@ -128,6 +134,20 @@ TEST(ThreadSafeModuleTest, ConsumingModuleDo) {
   TSM.consumingModuleDo([](std::unique_ptr<Module> M) {});
 }
 
+TEST(ThreadSafeModuleTest, CloneExternalModuleToNewContext) {
+  auto Ctx = std::make_unique<LLVMContext>();
+  auto M = parseModuleRaw(FooSrc, "foo.ll", *Ctx);
+  auto TSCtx = ThreadSafeContext(std::make_unique<LLVMContext>());
+  auto TSM = cloneExternalModuleToContext(*M, TSCtx);
+  TSM.withModuleDo([&](Module &NewM) {
+    EXPECT_NE(&NewM.getContext(), Ctx.get());
+    TSCtx.withContextDo(
+        [&](LLVMContext *NewCtx) { EXPECT_EQ(&NewM.getContext(), NewCtx); });
+    EXPECT_FALSE(NewM.empty());
+    EXPECT_FALSE(verifyModule(NewM, &errs()));
+  });
+}
+
 TEST(ThreadSafeModuleTest, CloneToNewContext) {
   auto TSM1 = parseModule(FooSrc, "foo.ll");
   auto TSM2 = cloneToNewContext(TSM1);
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index 0065615..03333d5 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -553,17 +553,15 @@ TEST(DIBuilder, FixedPointType) {
   EXPECT_TRUE(Ty->getTag() == dwarf::DW_TAG_base_type);
 }
 
-TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) {
+TEST(DbgAssignRecordTest, replaceVariableLocationOp) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define dso_local void @fun(i32 %v1, ptr %p1, ptr %p2) !dbg !7 {
     entry:
-      call void @llvm.dbg.assign(metadata i32 %v1, metadata !14, metadata !DIExpression(), metadata !17, metadata ptr %p1, metadata !DIExpression()), !dbg !16
+        #dbg_assign(i32 %v1, !14, !DIExpression(), !17, ptr %p1, !DIExpression(), !16)
       ret void
     }
 
-    declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
-
     !llvm.dbg.cu = !{!0}
     !llvm.module.flags = !{!3}
 
@@ -629,28 +627,28 @@ TEST(AssignmentTrackingTest, Utils) {
   std::unique_ptr<Module> M = parseIR(C, R"(
     define dso_local void @fun1() !dbg !7 {
     entry:
-      call void @llvm.dbg.assign(metadata i32 undef, metadata !10, metadata !DIExpression(), metadata !12, metadata i32 undef, metadata !DIExpression()), !dbg !13
+        #dbg_assign(i32 undef, !10, !DIExpression(), !12, i32 undef, !DIExpression(), !13)
       %local = alloca i32, align 4, !DIAssignID !12
-      call void @llvm.dbg.assign(metadata i32 undef, metadata !16, metadata !DIExpression(), metadata !12, metadata i32 undef, metadata !DIExpression()), !dbg !15
+        #dbg_assign(i32 undef, !16, !DIExpression(), !12, i32 undef, !DIExpression(), !15)
+        #dbg_assign(i32 undef, !16, !DIExpression(), !25, i32 undef, !DIExpression(), !15)
+        #dbg_assign(i32 undef, !16, !DIExpression(), !25, i32 undef, !DIExpression(), !15)
       ret void, !dbg !15
     }
 
     define dso_local void @fun2() !dbg !17 {
     entry:
       %local = alloca i32, align 4, !DIAssignID !20
-      call void @llvm.dbg.assign(metadata i32 undef, metadata !18, metadata !DIExpression(), metadata !20, metadata i32 undef, metadata !DIExpression()), !dbg !19
+        #dbg_assign(i32 undef, !18, !DIExpression(), !20, i32 undef, !DIExpression(), !19)
       ret void, !dbg !19
     }
 
     define dso_local void @fun3() !dbg !21 {
     entry:
       %local = alloca i32, align 4, !DIAssignID !24
-      call void @llvm.dbg.assign(metadata i32 undef, metadata !22, metadata !DIExpression(), metadata !24, metadata i32* undef, metadata !DIExpression()), !dbg !23
+        #dbg_assign(i32 undef, !22, !DIExpression(), !24, i32* undef, !DIExpression(), !23)
       ret void
     }
 
-    declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
-
     !llvm.dbg.cu = !{!0}
     !llvm.module.flags = !{!3, !4, !5}
     !llvm.ident = !{!6}
@@ -680,6 +678,7 @@ TEST(AssignmentTrackingTest, Utils) {
     !22 = !DILocalVariable(name: "local4", scope: !21, file: !1, line: 2, type: !11)
     !23 = !DILocation(line: 4, column: 1, scope: !21)
     !24 = distinct !DIAssignID()
+    !25 = distinct !DIAssignID()
     )");
 
   // Check the test IR isn't malformed.
diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
index e90f733..f13252f 100644
--- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
+++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
@@ -78,7 +78,7 @@ TEST(SMEAttributes, Constructors) {
                             "ret void\n}");
   CallBase &Call =
       cast<CallBase>((CallModule->getFunction("foo")->begin()->front()));
-  ASSERT_TRUE(SMECallAttrs(Call, nullptr).callsite().hasUndefZT0());
+  ASSERT_TRUE(SMECallAttrs(Call).callsite().hasUndefZT0());
 
   // Invalid combinations.
   EXPECT_DEBUG_DEATH(SA(SA::SM_Enabled | SA::SM_Compatible),
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index 6c35f60..0fc230c 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -356,7 +356,8 @@ const uint16_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameOffsetTable[] = {
 void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
     raw_ostream &OS) const {
   OS << "void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets("
-        "const llvm::Triple &TT, FloatABI::ABIType FloatABI) {\n"
+        "const llvm::Triple &TT, FloatABI::ABIType FloatABI, EABI EABIVersion, "
+        "StringRef ABIName) {\n"
         "  struct LibcallImplPair {\n"
         "    RTLIB::Libcall Func;\n"
         "    RTLIB::LibcallImpl Impl;\n"
@@ -562,7 +563,9 @@ void LibcallPredicateExpander::expand(SetTheory &ST, const Record *Def,
       auto [It, Inserted] = Func2Preds.insert({LibcallImpl, {{}, CCClass}});
       if (!Inserted) {
         PrintError(
-            Def, "combining nested libcall set predicates currently unhandled");
+            Def,
+            "combining nested libcall set predicates currently unhandled: '" +
+                LibcallImpl->getLibcallFuncName() + "'");
       }
 
       It->second.first.push_back(AP.getDef());
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 70141ba..efaf05a 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -53,26 +53,26 @@ constexpr StringLiteral EncodeMacroName = "GIMT_Encode";
 void emitEncodingMacrosDef(raw_ostream &OS) {
   OS << "#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n"
      << "#define " << EncodeMacroName << "2(Val)"
-     << " uint8_t(Val), uint8_t((uint16_t)Val >> 8)\n"
+     << " uint8_t(Val), uint8_t((Val) >> 8)\n"
      << "#define " << EncodeMacroName << "4(Val)"
-     << " uint8_t(Val), uint8_t((uint32_t)Val >> 8), "
-        "uint8_t((uint32_t)Val >> 16), uint8_t((uint32_t)Val >> 24)\n"
+     << " uint8_t(Val), uint8_t((Val) >> 8), "
+        "uint8_t((Val) >> 16), uint8_t((Val) >> 24)\n"
      << "#define " << EncodeMacroName << "8(Val)"
-     << " uint8_t(Val), uint8_t((uint64_t)Val >> 8), "
-        "uint8_t((uint64_t)Val >> 16), uint8_t((uint64_t)Val >> 24),  "
-        "uint8_t((uint64_t)Val >> 32), uint8_t((uint64_t)Val >> 40), "
-        "uint8_t((uint64_t)Val >> 48), uint8_t((uint64_t)Val >> 56)\n"
+     << " uint8_t(Val), uint8_t((Val) >> 8), "
+        "uint8_t((Val) >> 16), uint8_t((Val) >> 24),  "
+        "uint8_t(uint64_t(Val) >> 32), uint8_t(uint64_t(Val) >> 40), "
+        "uint8_t(uint64_t(Val) >> 48), uint8_t(uint64_t(Val) >> 56)\n"
      << "#else\n"
      << "#define " << EncodeMacroName << "2(Val)"
-     << " uint8_t((uint16_t)Val >> 8), uint8_t(Val)\n"
+     << " uint8_t((Val) >> 8), uint8_t(Val)\n"
      << "#define " << EncodeMacroName << "4(Val)"
-     << " uint8_t((uint32_t)Val >> 24), uint8_t((uint32_t)Val >> 16), "
-        "uint8_t((uint32_t)Val >> 8), uint8_t(Val)\n"
+     << " uint8_t((Val) >> 24), uint8_t((Val) >> 16), "
+        "uint8_t((Val) >> 8), uint8_t(Val)\n"
      << "#define " << EncodeMacroName << "8(Val)"
-     << " uint8_t((uint64_t)Val >> 56), uint8_t((uint64_t)Val >> 48), "
-        "uint8_t((uint64_t)Val >> 40), uint8_t((uint64_t)Val >> 32),  "
-        "uint8_t((uint64_t)Val >> 24), uint8_t((uint64_t)Val >> 16), "
-        "uint8_t((uint64_t)Val >> 8), uint8_t(Val)\n"
+     << " uint8_t(uint64_t(Val) >> 56), uint8_t(uint64_t(Val) >> 48), "
+        "uint8_t(uint64_t(Val) >> 40), uint8_t(uint64_t(Val) >> 32),  "
+        "uint8_t((Val) >> 24), uint8_t((Val) >> 16), "
+        "uint8_t((Val) >> 8), uint8_t(Val)\n"
      << "#endif\n";
 }
 
@@ -237,9 +237,12 @@ MatchTableRecord MatchTable::NamedValue(unsigned NumBytes, StringRef Namespace,
 
 MatchTableRecord MatchTable::IntValue(unsigned NumBytes, int64_t IntValue) {
   assert(isUIntN(NumBytes * 8, IntValue) || isIntN(NumBytes * 8, IntValue));
-  auto Str = llvm::to_string(IntValue);
-  if (NumBytes == 1 && IntValue < 0)
-    Str = "uint8_t(" + Str + ")";
+  uint64_t UIntValue = IntValue;
+  if (NumBytes < 8)
+    UIntValue &= (UINT64_C(1) << NumBytes * 8) - 1;
+  std::string Str = llvm::to_string(UIntValue);
+  if (UIntValue > INT64_MAX)
+    Str += 'u';
   // TODO: Could optimize this directly to save the compiler some work when
   // building the file
   return MatchTableRecord(std::nullopt, Str, NumBytes,
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index c5b35ee..b4b9cb0 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -1553,18 +1553,17 @@ void FilterChooser::reportRegion(bitAttr_t RA, unsigned StartBit,
 bool FilterChooser::filterProcessor(bool AllowMixed, bool Greedy) {
   Filters.clear();
   BestIndex = -1;
-  unsigned numInstructions = Opcodes.size();
 
-  assert(numInstructions && "Filter created with no instructions");
+  assert(!Opcodes.empty() && "Filter created with no instructions");
 
   // No further filtering is necessary.
-  if (numInstructions == 1)
+  if (Opcodes.size() == 1)
     return true;
 
   // Heuristics.  See also doFilter()'s "Heuristics" comment when num of
   // instructions is 3.
   if (AllowMixed && !Greedy) {
-    assert(numInstructions == 3);
+    assert(Opcodes.size() == 3);
 
     for (const auto &Opcode : Opcodes) {
       insn_t Insn = insnWithID(Opcode.EncodingID);
@@ -1579,8 +1578,6 @@ bool FilterChooser::filterProcessor(bool AllowMixed, bool Greedy) {
     }
   }
 
-  unsigned BitIndex;
-
   // We maintain BIT_WIDTH copies of the bitAttrs automaton.
   // The automaton consumes the corresponding bit from each
   // instruction.
@@ -1602,14 +1599,14 @@ bool FilterChooser::filterProcessor(bool AllowMixed, bool Greedy) {
 
   // FILTERED bit positions provide no entropy and are not worthy of pursuing.
   // Filter::recurse() set either BIT_TRUE or BIT_FALSE for each position.
-  for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex)
+  for (unsigned BitIndex = 0; BitIndex < BitWidth; ++BitIndex)
     if (FilterBitValues[BitIndex].isSet())
       bitAttrs[BitIndex] = ATTR_FILTERED;
 
   for (const auto &OpcPair : Opcodes) {
     insn_t insn = insnWithID(OpcPair.EncodingID);
 
-    for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex) {
+    for (unsigned BitIndex = 0; BitIndex < BitWidth; ++BitIndex) {
       switch (bitAttrs[BitIndex]) {
       case ATTR_NONE:
         if (insn[BitIndex] == BitValue::BIT_UNSET)
@@ -1655,7 +1652,7 @@ bool FilterChooser::filterProcessor(bool AllowMixed, bool Greedy) {
   bitAttr_t RA = ATTR_NONE;
   unsigned StartBit = 0;
 
-  for (BitIndex = 0; BitIndex < BitWidth; ++BitIndex) {
+  for (unsigned BitIndex = 0; BitIndex < BitWidth; ++BitIndex) {
     bitAttr_t bitAttr = bitAttrs[BitIndex];
 
     assert(bitAttr != ATTR_NONE && "Bit without attributes");
@@ -1736,12 +1733,12 @@ bool FilterChooser::filterProcessor(bool AllowMixed, bool Greedy) {
   case ATTR_FILTERED:
     break;
   case ATTR_ALL_SET:
-    reportRegion(RA, StartBit, BitIndex, AllowMixed);
+    reportRegion(RA, StartBit, BitWidth, AllowMixed);
     break;
   case ATTR_ALL_UNSET:
     break;
   case ATTR_MIXED:
-    reportRegion(RA, StartBit, BitIndex, AllowMixed);
+    reportRegion(RA, StartBit, BitWidth, AllowMixed);
     break;
   }
 
@@ -1773,8 +1770,7 @@ bool FilterChooser::filterProcessor(bool AllowMixed, bool Greedy) {
 // the instructions.  A conflict of instructions may occur, in which case we
 // dump the conflict set to the standard error.
 void FilterChooser::doFilter() {
-  unsigned Num = Opcodes.size();
-  assert(Num && "FilterChooser created with no instructions");
+  assert(!Opcodes.empty() && "FilterChooser created with no instructions");
 
   // Try regions of consecutive known bit values first.
   if (filterProcessor(false))
@@ -1788,7 +1784,7 @@ void FilterChooser::doFilter() {
   // no single instruction for the maximum ATTR_MIXED region Inst{14-4} has a
   // well-known encoding pattern.  In such case, we backtrack and scan for the
   // the very first consecutive ATTR_ALL_SET region and assign a filter to it.
-  if (Num == 3 && filterProcessor(true, false))
+  if (Opcodes.size() == 3 && filterProcessor(true, false))
     return;
 
   // If we come to here, the instruction decoding has failed.
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 3d11ce5..74f6374 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -201,7 +201,6 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUTargetObjectFile.cpp",
     "AMDGPUTargetTransformInfo.cpp",
     "AMDGPUUnifyDivergentExitNodes.cpp",
-    "AMDGPUUnifyMetadata.cpp",
     "AMDGPUWaitSGPRHazards.cpp",
     "GCNCreateVOPD.cpp",
     "GCNDPPCombine.cpp",
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index a9414eb..f58a4c6 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -137,6 +137,14 @@ else()
   set(MLIR_ENABLE_ROCM_CONVERSIONS 0)
 endif()
 
+# Build the XeVM conversions and run the corresponding tests if the SPIRV
+# backend is available.
+if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(MLIR_ENABLE_XEVM_CONVERSIONS 1)
+else()
+  set(MLIR_ENABLE_XEVM_CONVERSIONS 0)
+endif()
+
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the MLIR CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the MLIR ROCm runner")
 set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the MLIR SYCL runner")
diff --git a/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h b/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h
index b595b6a3..5abfb3d 100644
--- a/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h
+++ b/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h
@@ -10,8 +10,11 @@
 
 constexpr const char *alignedAllocFunctionName = "aligned_alloc";
 constexpr const char *mallocFunctionName = "malloc";
+constexpr const char *memcpyFunctionName = "memcpy";
 constexpr const char *cppStandardLibraryHeader = "cstdlib";
 constexpr const char *cStandardLibraryHeader = "stdlib.h";
+constexpr const char *cppStringLibraryHeader = "cstring";
+constexpr const char *cStringLibraryHeader = "string.h";
 
 namespace mlir {
 class DialectRegistry;
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index f7296a7..2058aba 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -52,6 +52,8 @@ def ConvertToLLVMPass : Pass<"convert-to-llvm"> {
                "Test conversion patterns of only the specified dialects">,
     Option<"useDynamic", "dynamic", "bool", "false",
            "Use op conversion attributes to configure the conversion">,
+    Option<"allowPatternRollback", "allow-pattern-rollback", "bool", "true",
+           "Experimental performance flag to disallow pattern rollback">
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 90da243..d6761f4 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -93,19 +93,22 @@ class ROCDL_IntrPure1Op<string mnemonic> :
 
 class ROCDL_IntrOp<string mnemonic, list<int> overloadedResults,
   list<int> overloadedOperands, list<Trait> traits, int numResults,
-  int requiresAccessGroup = 0, int requiresAliasAnalysis = 0, list<int> immArgPositions = [],
+  int requiresAccessGroup = 0, int requiresAliasAnalysis = 0,
+  int requiresArgAndResultAttrs = 0,
+  list<int> immArgPositions = [],
   list<string> immArgAttrNames = []> :
   LLVM_IntrOpBase<ROCDL_Dialect,  mnemonic,
     "amdgcn_" # !subst(".", "_", mnemonic), overloadedResults,
     overloadedOperands, traits, numResults, requiresAccessGroup,
-    requiresAliasAnalysis, 0, 0, 0, immArgPositions, immArgAttrNames>;
+    requiresAliasAnalysis, 0, requiresArgAndResultAttrs, 0,
+    immArgPositions, immArgAttrNames>;
 
 // Subclass to save typing and ease readibility when there aren't overloaded
 // operands or memory accesses.
 class ROCDL_ConcreteNonMemIntrOp<string mnemonic, list<Trait> traits,
     int numResults, list<int> immArgPositions = [],
     list<string> immArgNames = []>
-  : ROCDL_IntrOp<mnemonic, [], [], traits, numResults, 0, 0,
+  : ROCDL_IntrOp<mnemonic, [], [], traits, numResults, 0, 0, 0,
       immArgPositions, immArgNames>;
 //===----------------------------------------------------------------------===//
 // ROCDL special register op definitions
@@ -148,8 +151,11 @@ class ROCDL_DimGetterFunctionOp<string mnemonic, string device_function,
 //===----------------------------------------------------------------------===//
 
 class ROCDL_MbcntOp<string mnemonic> :
-    ROCDL_IntrPure1Op<"mbcnt." # mnemonic>,
-  Arguments<(ins I32:$in0, I32:$in1)> {
+    ROCDL_IntrOp<"mbcnt." # mnemonic, [], [], [Pure], 1,
+    0, 0, /*requiresArgAndResultAttrs=*/1> {
+  dag args = (ins I32:$in0, I32:$in1);
+  let arguments = !con(args, baseArgs);
+  let results = (outs I32:$res);
   let assemblyFormat = [{
     $in0 `,` $in1  attr-dict `:` `(` type($in0) `,` type($in1) `)` `->` type($res)
    }];
@@ -515,7 +521,7 @@ def ROCDL_ds_read_tr16_b64 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr16.b64">;
 //===---------------------------------------------------------------------===//
 
 def ROCDL_LoadToLDSOp :
-  ROCDL_IntrOp<"load.to.lds", [], [0], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> {
+  ROCDL_IntrOp<"load.to.lds", [], [0], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
   dag args = (ins Arg<LLVM_AnyPointer, "", [MemRead]>:$globalPtr,
                  Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
                  I32Attr:$size,
@@ -534,7 +540,7 @@ def ROCDL_LoadToLDSOp :
 }
 
 def ROCDL_GlobalLoadLDSOp :
-  ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> {
+  ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
   dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
                  Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
                  I32Attr:$size,
@@ -748,7 +754,7 @@ def ROCDL_RawBufferAtomicUMinOp :
 
 // DPP Update intrinsic
 def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
-    [AllTypesMatch<["res", "src", "old"]>], 1, 0, 0,
+    [AllTypesMatch<["res", "src", "old"]>], 1, 0, 0, 0,
       [2, 3, 4, 5], ["dppCtrl", "rowMask", "bankMask", "boundCtrl"]>,
   Arguments<(ins LLVM_Type:$old, LLVM_Type:$src, I32Attr:$dppCtrl, I32Attr:$rowMask,
       I32Attr:$bankMask, I1Attr:$boundCtrl)> {
@@ -760,7 +766,7 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
 
 // PermLaneX16 intrinsic operation
 def ROCDL_PermlaneX16Op : ROCDL_IntrOp<"permlanex16", [], [0],
-    [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0,
+    [AllTypesMatch<["res", "old", "src0"]>, AllTypesMatch<["src1", "src2"]>], 1, 0, 0, 0,
     [4, 5], ["fi", "boundControl"]>,
   Arguments<(ins LLVM_Type:$old, LLVM_Type:$src0, LLVM_Type:$src1, LLVM_Type:$src2,
              I1Attr:$fi, I1Attr:$boundControl)> {
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 1e5b5d4..8d5306d 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1690,7 +1690,7 @@ struct DecomposeOuterUnitDimsPackOpPattern
 /// Rewrites a linalg::UnPackOp into a sequence of rank-reduced
 ///   * tensor::ExtractSliceOp + linalg::TransposeOp + tensor::InsertSliceOp
 ///
-/// Requires that all the outer dims of the input linalg::PackOp are 1.
+/// Requires that all the tiled outer dims of the input linalg::PackOp are 1.
 ///
 /// Before:
 /// ```
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 30c1d97..c7b8367 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -2058,39 +2058,52 @@ def Vector_GatherOp :
     Results<(outs AnyVectorOfNonZeroRank:$result)> {
 
   let summary = [{
-    gathers elements from memory or ranked tensor into a vector as defined by an
-    index vector and a mask vector
+    Gathers elements from memory or ranked tensor into a vector as defined by an
+    index vector and a mask vector.
   }];
 
   let description = [{
     The gather operation returns an n-D vector whose elements are either loaded
-    from memory or ranked tensor, or taken from a pass-through vector, depending
+    from a k-D memref or tensor, or taken from an n-D pass-through vector, depending
     on the values of an n-D mask vector.
-    If a mask bit is set, the corresponding result element is defined by the base
-    with indices and the n-D index vector (each index is a 1-D offset on the base).
-    Otherwise, the corresponding element is taken from the n-D pass-through vector.
-    Informally the semantics are:
+
+    If a mask bit is set, the corresponding result element is taken from `base`
+    at an index defined by k indices and n-D `index_vec`. Otherwise, the element
+    is taken from the pass-through vector. As an example, suppose that `base` is
+    3-D and the result is 2-D:
+
+    ```mlir
+    func.func @gather_3D_to_2D(
+        %base: memref<?x10x?xf32>, %i0: index, %i1: index, %i2: index,
+        %index_vec: vector<2x3xi32>, %mask: vector<2x3xi1>,
+        %fall_thru: vector<2x3xf32>) -> vector<2x3xf32> {
+            %result = vector.gather %base[%i0, %i1, %i2]
+                                   [%index_vec], %mask, %fall_thru : [...]
+            return %result : vector<2x3xf32>
+    }
     ```
-    result[0] := if mask[0] then base[index[0]] else pass_thru[0]
-    result[1] := if mask[1] then base[index[1]] else pass_thru[1]
-    etc.
+
+    The indexing semantics are then,
+
+    ```
+    result[i,j] := if mask[i,j] then base[i0, i1, i2 + index_vec[i,j]]
+                   else pass_thru[i,j]
     ```
+    The index into `base` only varies in the innermost ((k-1)-th) dimension.
 
     If a mask bit is set and the corresponding index is out-of-bounds for the
     given base, the behavior is undefined. If a mask bit is not set, the value
     comes from the pass-through vector regardless of the index, and the index is
     allowed to be out-of-bounds.
 
-    The gather operation can be used directly where applicable, or can be used
-    during progressively lowering to bring other memory operations closer to
-    hardware ISA support for a gather.
-
     Examples:
 
     ```mlir
+    // 1-D memref gathered to 2-D vector.
     %0 = vector.gather %base[%c0][%v], %mask, %pass_thru
        : memref<?xf32>, vector<2x16xi32>, vector<2x16xi1>, vector<2x16xf32> into vector<2x16xf32>
 
+    // 2-D memref gathered to 1-D vector.
     %1 = vector.gather %base[%i, %j][%v], %mask, %pass_thru
        : memref<16x16xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
     ```
diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Target.h b/mlir/include/mlir/Target/LLVM/XeVM/Target.h
new file mode 100644
index 0000000..6aab15c
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Target.h
@@ -0,0 +1,30 @@
+//===-- Target.h - MLIR XeVM target registration ----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for attaching the XeVM target interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVM_XEVM_TARGET_H
+#define MLIR_TARGET_LLVM_XEVM_TARGET_H
+
+namespace mlir {
+class DialectRegistry;
+class MLIRContext;
+namespace xevm {
+/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
+/// the given registry.
+void registerXeVMTargetInterfaceExternalModels(mlir::DialectRegistry &registry);
+
+/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
+/// the registry associated with the given context.
+void registerXeVMTargetInterfaceExternalModels(mlir::MLIRContext &context);
+} // namespace xevm
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVM_XEVM_TARGET_H
diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
new file mode 100644
index 0000000..5d523f1
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
@@ -0,0 +1,63 @@
+//===-- Utils.h - MLIR XeVM target utils ------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XeVM target-related utility classes and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVM_XEVM_UTILS_H
+#define MLIR_TARGET_LLVM_XEVM_UTILS_H
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/Target/LLVM/ModuleToObject.h"
+
+namespace mlir {
+namespace xevm {
+
+/// Base class for all XeVM serializations from GPU modules into binary strings.
+/// By default this class serializes into LLVM bitcode.
+class SerializeGPUModuleBase : public LLVM::ModuleToObject {
+public:
+  SerializeGPUModuleBase(Operation &module, XeVMTargetAttr target,
+                         const gpu::TargetOptions &targetOptions = {});
+
+  /// Returns the target attribute.
+  XeVMTargetAttr getTarget() const;
+
+  /// Loads the bitcode files in `librariesToLink`.
+  std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+  loadBitcodeFiles(llvm::Module &module) override;
+
+  /// Returns the gpu module being serialized.
+  gpu::GPUModuleOp getGPUModuleOp();
+
+  /// Compiles to native code using `ocloc`.
+  std::optional<SmallVector<char, 0>> compileToBinary(const std::string &asmStr,
+                                                      StringRef inputFormat);
+
+protected:
+  /// XeVM Target attribute.
+  XeVMTargetAttr xeTarget;
+  /// List of LLVM bitcode libraries to link in after translation to LLVM IR.
+  /// The attributes can be StringAttr pointing to a file path, or
+  /// a Resource blob pointing to the LLVM bitcode in-memory.
+  SmallVector<Attribute> librariesToLink;
+
+  /// Returns the path to the tool used for serialization.
+  std::optional<std::string> findTool(StringRef tool);
+
+  /// GPU compilation target options.
+  gpu::TargetOptions targetOptions;
+};
+} // namespace xevm
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVM_XEVM_UTILS_H
diff --git a/mlir/include/mlir/Target/SPIRV/Serialization.h b/mlir/include/mlir/Target/SPIRV/Serialization.h
index bc58093..e474101 100644
--- a/mlir/include/mlir/Target/SPIRV/Serialization.h
+++ b/mlir/include/mlir/Target/SPIRV/Serialization.h
@@ -38,14 +38,14 @@ struct SerializationOptions {
   /// or an absolute path followed by the prefix. For example:
   ///
   ///   * "foo" - Create files with a `foo` prefix in the current working
-  ///     directory. For example: `fooXYZ123`, `fooABC456` ... `fooXXXXXX`.
-  ///     The last 6 characters will be a unique combination as
-  ///     generated by `llvm::sys::fs::createUniqueFile`.
+  ///     directory. For example: `fooXYZ123.spv`, `fooABC456.spv` ...
+  ///     `fooXXXXXX.spv`. The last 6 characters will be a unique combination
+  ///     as generated by `llvm::sys::fs::createUniqueFile`.
   ///
   ///   * "my/dir/foo" - Create files in `my/dir` with a `foo` prefix. The
-  ///     `my/dir` need to exists. For example: `fooXYZ123`, `fooABC456` ...
-  ///     `fooXXXXXX` will be created and stored in `/my/dir`. Filenames
-  ///     follow the same pattern as above.
+  ///     `my/dir` needs to exist. For example: `fooXYZ123.spv`,
+  ///     `fooABC456.spv` ... `fooXXXXXX.spv` will be created and stored in
+  ///     `/my/dir`. Filenames follow the same pattern as above.
   ///
   ///   * "/home/user/my/dir" - Same as above but using an absolute path.
   std::string validationFilePrefix = "";
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index ea82848..e601c82 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -1241,16 +1241,17 @@ struct ConversionConfig {
   /// 2. Pattern produces IR (in-place modification or new IR) that is illegal
   ///    and cannot be legalized by subsequent foldings / pattern applications.
   ///
-  /// If set to "false", the conversion driver will produce an LLVM fatal error
-  /// instead of rolling back IR modifications. Moreover, in case of a failed
-  /// conversion, the original IR is not restored. The resulting IR may be a
-  /// mix of original and rewritten IR. (Same as a failed greedy pattern
-  /// rewrite.)
+  /// Experimental: If set to "false", the conversion driver will produce an
+  /// LLVM fatal error instead of rolling back IR modifications. Moreover, in
+  /// case of a failed conversion, the original IR is not restored. The
+  /// resulting IR may be a mix of original and rewritten IR. (Same as a failed
+  /// greedy pattern rewrite.) Use the cmake build option
+  /// `-DMLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS=ON` (ideally together with
+  /// ASAN) to detect invalid pattern API usage.
   ///
-  /// Note: This flag was added in preparation of the One-Shot Dialect
-  /// Conversion refactoring, which will remove the ability to roll back IR
-  /// modifications from the conversion driver. Use this flag to ensure that
-  /// your patterns do not trigger any IR rollbacks. For details, see
+  /// When pattern rollback is disabled, the conversion driver has to maintain
+  /// less internal state. This is more efficient, but not supported by all
+  /// lowering patterns. For details, see
   /// https://discourse.llvm.org/t/rfc-a-new-one-shot-dialect-conversion-driver/79083.
   bool allowPatternRollback = true;
 
diff --git a/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp b/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp
index ed5d6d4..cdb7150 100644
--- a/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp
+++ b/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp
@@ -31,7 +31,8 @@ namespace {
 class ConvertToLLVMPassInterface {
 public:
   ConvertToLLVMPassInterface(MLIRContext *context,
-                             ArrayRef<std::string> filterDialects);
+                             ArrayRef<std::string> filterDialects,
+                             bool allowPatternRollback = true);
   virtual ~ConvertToLLVMPassInterface() = default;
 
   /// Get the dependent dialects used by `convert-to-llvm`.
@@ -60,6 +61,9 @@ protected:
   MLIRContext *context;
   /// List of dialects names to use as filters.
   ArrayRef<std::string> filterDialects;
+  /// Experimental: if false, pattern rollback is disallowed. This is more
+  /// efficient but not supported by all lowering patterns.
+  bool allowPatternRollback;
 };
 
 /// This DialectExtension can be attached to the context, which will invoke the
@@ -128,7 +132,9 @@ struct StaticConvertToLLVM : public ConvertToLLVMPassInterface {
 
   /// Apply the conversion driver.
   LogicalResult transform(Operation *op, AnalysisManager manager) const final {
-    if (failed(applyPartialConversion(op, *target, *patterns)))
+    ConversionConfig config;
+    config.allowPatternRollback = allowPatternRollback;
+    if (failed(applyPartialConversion(op, *target, *patterns, config)))
       return failure();
     return success();
   }
@@ -179,7 +185,9 @@ struct DynamicConvertToLLVM : public ConvertToLLVMPassInterface {
                                               patterns);
 
     // Apply the conversion.
-    if (failed(applyPartialConversion(op, target, std::move(patterns))))
+    ConversionConfig config;
+    config.allowPatternRollback = allowPatternRollback;
+    if (failed(applyPartialConversion(op, target, std::move(patterns), config)))
       return failure();
     return success();
   }
@@ -206,9 +214,11 @@ public:
     std::shared_ptr<ConvertToLLVMPassInterface> impl;
     // Choose the pass implementation.
     if (useDynamic)
-      impl = std::make_shared<DynamicConvertToLLVM>(context, filterDialects);
+      impl = std::make_shared<DynamicConvertToLLVM>(context, filterDialects,
+                                                    allowPatternRollback);
     else
-      impl = std::make_shared<StaticConvertToLLVM>(context, filterDialects);
+      impl = std::make_shared<StaticConvertToLLVM>(context, filterDialects,
+                                                   allowPatternRollback);
     if (failed(impl->initialize()))
       return failure();
     this->impl = impl;
@@ -228,8 +238,10 @@ public:
 //===----------------------------------------------------------------------===//
 
 ConvertToLLVMPassInterface::ConvertToLLVMPassInterface(
-    MLIRContext *context, ArrayRef<std::string> filterDialects)
-    : context(context), filterDialects(filterDialects) {}
+    MLIRContext *context, ArrayRef<std::string> filterDialects,
+    bool allowPatternRollback)
+    : context(context), filterDialects(filterDialects),
+      allowPatternRollback(allowPatternRollback) {}
 
 void ConvertToLLVMPassInterface::getDependentDialects(
     DialectRegistry &registry) {
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d22364e..e6fbcf9 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -79,17 +79,30 @@ static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
   return canBeBare;
 }
 
-static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
-                       const unsigned indexBitwidth) {
+static Value getLaneId(RewriterBase &rewriter, Location loc) {
   auto int32Type = IntegerType::get(rewriter.getContext(), 32);
   Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
   Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
-  Value mbcntLo = ROCDL::MbcntLoOp::create(rewriter, loc, int32Type,
-                                           ValueRange{minus1, zero});
-  Value laneId = ROCDL::MbcntHiOp::create(rewriter, loc, int32Type,
-                                          ValueRange{minus1, mbcntLo});
+  NamedAttribute noundef = rewriter.getNamedAttr(
+      LLVM::LLVMDialect::getNoUndefAttrName(), rewriter.getUnitAttr());
+  NamedAttribute lowRange = rewriter.getNamedAttr(
+      LLVM::LLVMDialect::getRangeAttrName(),
+      LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
+                                   APInt(32, 32)));
+  NamedAttribute highRange = rewriter.getNamedAttr(
+      LLVM::LLVMDialect::getRangeAttrName(),
+      LLVM::ConstantRangeAttr::get(rewriter.getContext(), APInt::getZero(32),
+                                   APInt(32, 64)));
+  Value mbcntLo = ROCDL::MbcntLoOp::create(
+      rewriter, loc, int32Type, minus1, zero, /*arg_attrs=*/{},
+      /*res_attrs=*/
+      rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, lowRange})));
+  Value laneId = ROCDL::MbcntHiOp::create(
+      rewriter, loc, int32Type, minus1, mbcntLo, /*arg_attrs=*/{},
+      rewriter.getArrayAttr(rewriter.getDictionaryAttr({noundef, highRange})));
   return laneId;
 }
+
 static constexpr StringLiteral amdgcnDataLayout =
     "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
     "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
@@ -104,18 +117,16 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
   LogicalResult
   matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto loc = op->getLoc();
+    Location loc = op.getLoc();
     MLIRContext *context = rewriter.getContext();
-    // convert to:  %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
-    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)
-
-    Type intTy = IntegerType::get(context, 32);
-    Value zero = arith::ConstantIntOp::create(rewriter, loc, 0, 32);
-    Value minus1 = arith::ConstantIntOp::create(rewriter, loc, -1, 32);
-    Value mbcntLo = ROCDL::MbcntLoOp::create(rewriter, loc, intTy,
-                                             ValueRange{minus1, zero});
-    Value laneId = ROCDL::MbcntHiOp::create(rewriter, loc, intTy,
-                                            ValueRange{minus1, mbcntLo});
+    // convert to:
+    //   %mlo = call noundef range(i32 0, 32)
+    //     @llvm.amdgcn.mbcnt.lo(-1, 0)
+    // followed by:
+    //   %lid = call noundef range(i32 0, 64)
+    //     @llvm.amdgcn.mbcnt.hi(-1, %mlo)
+
+    Value laneId = getLaneId(rewriter, loc);
     // Truncate or extend the result depending on the index bitwidth specified
     // by the LLVMTypeConverter options.
     const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
@@ -185,8 +196,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
     Location loc = op->getLoc();
     Value initShflValue = adaptor.getValue();
 
-    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
-    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);
+    Value srcLaneId = getLaneId(rewriter, loc);
 
     auto int32Type = IntegerType::get(rewriter.getContext(), 32);
     Value width = adaptor.getWidth();
diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp
index 6bd0e2d..a1f38c9 100644
--- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp
+++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp
@@ -17,11 +17,13 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeRange.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include <cstdint>
+#include <numeric>
 
 using namespace mlir;
 
@@ -97,6 +99,48 @@ Type convertMemRefType(MemRefType opTy, const TypeConverter *typeConverter) {
   return resultTy;
 }
 
+/// Emits the EmitC ops computing the total size in bytes of `memrefType`,
+/// i.e. `sizeof(<element type>) * <number of elements>`, at the insertion
+/// point of `builder`, and returns the resulting `size_t` value.
+///
+/// `memrefType` must be legal for the EmitC conversion (asserted below).
+static Value calculateMemrefTotalSizeBytes(Location loc, MemRefType memrefType,
+                                           OpBuilder &builder) {
+  assert(isMemRefTypeLegalForEmitC(memrefType) &&
+         "incompatible memref type for EmitC conversion");
+  Type sizeTType = emitc::SizeTType::get(builder.getContext());
+
+  // `sizeof(<element type>)`, emitted as an opaque call taking the element
+  // type as its only (attribute) argument.
+  emitc::CallOpaqueOp elementSize = emitc::CallOpaqueOp::create(
+      builder, loc, sizeTType, builder.getStringAttr("sizeof"), ValueRange{},
+      ArrayAttr::get(builder.getContext(),
+                     {TypeAttr::get(memrefType.getElementType())}));
+
+  // The element count is the product of all extents. getNumElements()
+  // requires a static shape; presumably the legality check asserted above
+  // guarantees that -- confirm against isMemRefTypeLegalForEmitC.
+  emitc::ConstantOp numElementsValue = emitc::ConstantOp::create(
+      builder, loc, builder.getIndexType(),
+      builder.getIndexAttr(memrefType.getNumElements()));
+
+  emitc::MulOp totalSizeBytes = emitc::MulOp::create(
+      builder, loc, sizeTType, elementSize.getResult(0), numElementsValue);
+
+  return totalSizeBytes.getResult();
+}
+
+/// Builds `&array[0]...[0]`: subscripts `arrayValue` with one zero index per
+/// array dimension and applies the address-of operator to that element. The
+/// result is a pointer to the array's element type.
+static emitc::ApplyOp
+createPointerFromEmitcArray(Location loc, OpBuilder &builder,
+                            TypedValue<emitc::ArrayType> arrayValue) {
+  emitc::ArrayType arrayTy = arrayValue.getType();
+
+  // A single zero constant is reused for every dimension.
+  emitc::ConstantOp zero = emitc::ConstantOp::create(
+      builder, loc, builder.getIndexType(), builder.getIndexAttr(0));
+  llvm::SmallVector<mlir::Value> firstElementIndices(arrayTy.getRank(), zero);
+
+  emitc::SubscriptOp firstElement = emitc::SubscriptOp::create(
+      builder, loc, arrayValue, ValueRange(firstElementIndices));
+
+  Type pointerTy = emitc::PointerType::get(arrayTy.getElementType());
+  return emitc::ApplyOp::create(builder, loc, pointerTy,
+                                builder.getStringAttr("&"), firstElement);
+}
+
 struct ConvertAlloc final : public OpConversionPattern<memref::AllocOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
@@ -159,6 +203,47 @@ struct ConvertAlloc final : public OpConversionPattern<memref::AllocOp> {
   }
 };
 
+/// Lowers `memref.copy` to an opaque call to `memcpy(target, source, size)`,
+/// where both pointers address the first element of the converted EmitC
+/// arrays and `size` is the byte size of the source memref.
+struct ConvertCopy final : public OpConversionPattern<memref::CopyOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(memref::CopyOp copyOp, OpAdaptor operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = copyOp.getLoc();
+
+    // Use dyn_cast so that an operand type this pattern does not handle
+    // (NOTE(review): memref.copy presumably also accepts unranked memrefs)
+    // is reported as a match failure instead of tripping a cast assertion.
+    auto srcMemrefType = dyn_cast<MemRefType>(copyOp.getSource().getType());
+    if (!srcMemrefType || !isMemRefTypeLegalForEmitC(srcMemrefType))
+      return rewriter.notifyMatchFailure(
+          loc, "incompatible source memref type for EmitC conversion");
+
+    auto targetMemrefType =
+        dyn_cast<MemRefType>(copyOp.getTarget().getType());
+    if (!targetMemrefType || !isMemRefTypeLegalForEmitC(targetMemrefType))
+      return rewriter.notifyMatchFailure(
+          loc, "incompatible target memref type for EmitC conversion");
+
+    // Validate both converted operands before any op is created, so a
+    // failed match leaves no partially built IR behind.
+    auto srcArrayValue =
+        dyn_cast<TypedValue<emitc::ArrayType>>(operands.getSource());
+    if (!srcArrayValue)
+      return rewriter.notifyMatchFailure(loc, "expected source array type");
+
+    auto targetArrayValue =
+        dyn_cast<TypedValue<emitc::ArrayType>>(operands.getTarget());
+    if (!targetArrayValue)
+      return rewriter.notifyMatchFailure(loc, "expected target array type");
+
+    emitc::ApplyOp srcPtr =
+        createPointerFromEmitcArray(loc, rewriter, srcArrayValue);
+    emitc::ApplyOp targetPtr =
+        createPointerFromEmitcArray(loc, rewriter, targetArrayValue);
+
+    // The call produces no results, matching the result-less memref.copy.
+    emitc::CallOpaqueOp memCpyCall = emitc::CallOpaqueOp::create(
+        rewriter, loc, TypeRange{}, "memcpy",
+        ValueRange{
+            targetPtr.getResult(), srcPtr.getResult(),
+            calculateMemrefTotalSizeBytes(loc, srcMemrefType, rewriter)});
+
+    rewriter.replaceOp(copyOp, memCpyCall.getResults());
+
+    return success();
+  }
+};
+
 struct ConvertGlobal final : public OpConversionPattern<memref::GlobalOp> {
   using OpConversionPattern::OpConversionPattern;
 
@@ -320,6 +405,7 @@ void mlir::populateMemRefToEmitCTypeConversion(TypeConverter &typeConverter) {
 
 void mlir::populateMemRefToEmitCConversionPatterns(
     RewritePatternSet &patterns, const TypeConverter &converter) {
-  patterns.add<ConvertAlloca, ConvertAlloc, ConvertGlobal, ConvertGetGlobal,
-               ConvertLoad, ConvertStore>(converter, patterns.getContext());
+  patterns.add<ConvertAlloca, ConvertAlloc, ConvertCopy, ConvertGlobal,
+               ConvertGetGlobal, ConvertLoad, ConvertStore>(
+      converter, patterns.getContext());
 }
diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp
index e78dd76..a518902 100644
--- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp
+++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp
@@ -18,6 +18,8 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTMEMREFTOEMITC
@@ -27,6 +29,15 @@ namespace mlir {
 using namespace mlir;
 
 namespace {
+
+/// Creates an `emitc.include` op for `headerName`, flagged as a standard
+/// include, at the current insertion point of `builder`. The op carries the
+/// location of `module`.
+emitc::IncludeOp addStandardHeader(OpBuilder &builder, ModuleOp module,
+                                   StringRef headerName) {
+  UnitAttr standardIncludeFlag = builder.getUnitAttr();
+  return builder.create<emitc::IncludeOp>(
+      module.getLoc(), builder.getStringAttr(headerName),
+      /*is_standard_include=*/standardIncludeFlag);
+}
+
 struct ConvertMemRefToEmitCPass
     : public impl::ConvertMemRefToEmitCBase<ConvertMemRefToEmitCPass> {
   using Base::Base;
@@ -55,34 +66,29 @@ struct ConvertMemRefToEmitCPass
       return signalPassFailure();
 
     mlir::ModuleOp module = getOperation();
+    llvm::SmallSet<StringRef, 4> existingHeaders;
+    mlir::OpBuilder builder(module.getBody(), module.getBody()->begin());
+    module.walk([&](mlir::emitc::IncludeOp includeOp) {
+      if (includeOp.getIsStandardInclude())
+        existingHeaders.insert(includeOp.getInclude());
+    });
+
     module.walk([&](mlir::emitc::CallOpaqueOp callOp) {
-      if (callOp.getCallee() != alignedAllocFunctionName &&
-          callOp.getCallee() != mallocFunctionName) {
+      StringRef expectedHeader;
+      if (callOp.getCallee() == alignedAllocFunctionName ||
+          callOp.getCallee() == mallocFunctionName)
+        expectedHeader = options.lowerToCpp ? cppStandardLibraryHeader
+                                            : cStandardLibraryHeader;
+      else if (callOp.getCallee() == memcpyFunctionName)
+        expectedHeader =
+            options.lowerToCpp ? cppStringLibraryHeader : cStringLibraryHeader;
+      else
         return mlir::WalkResult::advance();
+      if (!existingHeaders.contains(expectedHeader)) {
+        addStandardHeader(builder, module, expectedHeader);
+        existingHeaders.insert(expectedHeader);
       }
-
-      for (auto &op : *module.getBody()) {
-        emitc::IncludeOp includeOp = llvm::dyn_cast<mlir::emitc::IncludeOp>(op);
-        if (!includeOp) {
-          continue;
-        }
-        if (includeOp.getIsStandardInclude() &&
-            ((options.lowerToCpp &&
-              includeOp.getInclude() == cppStandardLibraryHeader) ||
-             (!options.lowerToCpp &&
-              includeOp.getInclude() == cStandardLibraryHeader))) {
-          return mlir::WalkResult::interrupt();
-        }
-      }
-
-      mlir::OpBuilder builder(module.getBody(), module.getBody()->begin());
-      StringAttr includeAttr =
-          builder.getStringAttr(options.lowerToCpp ? cppStandardLibraryHeader
-                                                   : cStandardLibraryHeader);
-      builder.create<mlir::emitc::IncludeOp>(
-          module.getLoc(), includeAttr,
-          /*is_standard_include=*/builder.getUnitAttr());
-      return mlir::WalkResult::interrupt();
+      return mlir::WalkResult::advance();
     });
   }
 };
diff --git a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
index e9cf493..6da76e9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVM/XeVM/Target.h"
 #include "llvm/Support/Regex.h"
 
 namespace mlir {
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 9d7fb18..db4edfe 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -5767,11 +5767,18 @@ ArrayRef<int64_t> UnPackOp::getAllOuterDims() {
 
 SmallVector<int64_t> UnPackOp::getTiledOuterDims() {
   auto innerDimsPos = getInnerDimsPos();
-  auto packedShape = getSourceType().getShape();
+  SmallVector<int64_t> outerDims(getAllOuterDims());
   SmallVector<int64_t> res;
 
+  // Recover the original order of the outer dims (inverse is returned).
+  SmallVector<int64_t> outerDimPermInv =
+      invertPermutationVector(getOuterDimsPerm());
+  if (!outerDimPermInv.empty())
+    applyPermutationToVector(outerDims, outerDimPermInv);
+
+  // Collect the outer dims corresponding to the tiled inner dims.
   for (auto index : innerDimsPos)
-    res.push_back(packedShape[index]);
+    res.push_back(outerDims[index]);
 
   return res;
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 80fbe3c..406f05c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2609,6 +2609,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
   return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
                  isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
                  isa<linalg::MatvecOp>(op) || isa<linalg::Mmt4DOp>(op) ||
+                 isa<linalg::BatchMmt4DOp>(op) ||
                  hasReductionIterator(linalgOp));
 }
 
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 34c95e3..8474244 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -422,6 +422,12 @@ std::optional<InFlightDiagnostic> verifyTmaDescriptorWithMemref(
                            << descMemref << " != " << dstMemref;
   }
 
+  int lastDimBytes =
+      descMemref.getShape().back() * descMemref.getElementTypeBitWidth() / 8;
+  if (lastDimBytes % 16 != 0) {
+    return op->emitError() << "the bytes in the last dimension of the tensor "
+                              "map must be a multiple of 16";
+  }
   return std::nullopt;
 }
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index 134aef3..0e88d31d 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -730,9 +730,9 @@ public:
                    {tensor, lvlCoords, values, filled, added, count},
                    EmitCInterface::On);
     Operation *parent = getTop(op);
+    rewriter.setInsertionPointAfter(parent);
     rewriter.replaceOp(op, adaptor.getTensor());
     // Deallocate the buffers on exit of the loop nest.
-    rewriter.setInsertionPointAfter(parent);
     memref::DeallocOp::create(rewriter, loc, values);
     memref::DeallocOp::create(rewriter, loc, filled);
     memref::DeallocOp::create(rewriter, loc, added);
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index e3cba388..fce61f2 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -1120,13 +1120,14 @@ OpFoldResult MulOp::fold(FoldAdaptor adaptor) {
   }
 
   if (rhsTy == resultTy) {
-    if (isSplatZero(resultETy, lhsAttr))
+    if (isSplatZero(resultETy, lhsAttr) && resultTy.hasStaticShape())
+      // constant values can only be resized if resulting type is static
       return lhsAttr.resizeSplat(resultTy);
     if (isSplatOne(resultETy, lhsAttr, shift))
       return rhs;
   }
   if (lhsTy == resultTy) {
-    if (isSplatZero(resultETy, rhsAttr))
+    if (isSplatZero(resultETy, rhsAttr) && resultTy.hasStaticShape())
       return rhsAttr.resizeSplat(resultTy);
     if (isSplatOne(resultETy, rhsAttr, shift))
       return lhs;
diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index e6ef028..34385d7 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -276,7 +276,7 @@ std::optional<int64_t> constantTripCount(OpFoldResult lb, OpFoldResult ub,
   if (!ubConstant)
     return std::nullopt;
   std::optional<int64_t> stepConstant = getConstantIntValue(step);
-  if (!stepConstant)
+  if (!stepConstant || *stepConstant == 0)
     return std::nullopt;
 
   return llvm::divideCeilSigned(*ubConstant - *lbConstant, *stepConstant);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 97c97ac3..270d71a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -647,17 +647,55 @@ struct UnrealizedConversionCastOpPattern
   }
 };
 
+// Distributes a workgroup-level splat `arith.constant` of vector type into
+// subgroup-level constants: one subgroup-shaped constant is created and used
+// `count` times as the replacement values.
+struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
+  using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(arith::ConstantOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Only splat dense constants with a vector result type are handled.
+    auto denseAttr = dyn_cast<DenseElementsAttr>(op.getValue());
+    if (!denseAttr || !denseAttr.isSplat())
+      return failure();
+    auto wgType = dyn_cast<VectorType>(op.getType());
+    if (!wgType)
+      return failure();
+
+    // A layout with a subgroup layout is required to distribute.
+    xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op.getResult());
+    if (!(layout && layout.getSgLayout()))
+      return failure();
+
+    auto [sgShape, count] = getSgShapeAndCount(wgType.getShape(), layout);
+
+    // Current limitation: constant of vector with single value.
+    // TODO: support more complex cases, e.g., vector with multiple values.
+    Attribute splatValue = denseAttr.getSplatValue<Attribute>();
+
+    auto sgType = VectorType::get(sgShape, wgType.getElementType());
+    auto sgConstant = rewriter.create<arith::ConstantOp>(
+        op.getLoc(), sgType, DenseElementsAttr::get(sgType, splatValue));
+
+    // Keep whatever remains of the layout once the subgroup part is dropped.
+    if (auto remainingLayout = layout.dropSgLayoutAndData())
+      xegpu::setLayoutAttr(sgConstant->getResult(0), remainingLayout);
+
+    SmallVector<Value> replacements(count, sgConstant);
+    rewriter.replaceOpWithMultiple(op, {replacements});
+    return success();
+  }
+};
+
 } // namespace
 
 namespace mlir {
 namespace xegpu {
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
-  patterns
-      .add<WgToSgCreateNdOp, WgToSgCreateNdOpNoOffset, WgToSgLoadNdOp,
-           WgToSgStoreNdOp, WgToSgUpdateNdOffsetOp, WgToSgDpasOp,
-           WgToSgPrefetchNdOp, UnrealizedConversionCastOpPattern,
-           WgToSgElementwiseOp, WgToSgVectorBroadcastOp, WgToSgConvertLayoutOp>(
-          patterns.getContext());
+  patterns.add<WgToSgCreateNdOp, WgToSgCreateNdOpNoOffset, WgToSgLoadNdOp,
+               WgToSgStoreNdOp, WgToSgUpdateNdOffsetOp, WgToSgDpasOp,
+               WgToSgPrefetchNdOp, UnrealizedConversionCastOpPattern,
+               WgToSgElementwiseOp, WgToSgVectorBroadcastOp,
+               WgToSgConvertLayoutOp, WgToSgArithConstantOp>(
+      patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -769,6 +807,14 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
         return isLegal(xegpu::getLayoutAttr(op.getResult()));
       });
 
+  target.addDynamicallyLegalOp<arith::ConstantOp>(
+      [=](arith::ConstantOp op) -> bool {
+        auto vecType = dyn_cast<VectorType>(op.getType());
+        if (!vecType)
+          return true;
+        return isLegal(xegpu::getLayoutAttr(op.getResult()));
+      });
+
   target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
       [=](xegpu::ConvertLayoutOp op) -> bool {
         return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
diff --git a/mlir/lib/RegisterAllDialects.cpp b/mlir/lib/RegisterAllDialects.cpp
index 950b85e2..258fed1 100644
--- a/mlir/lib/RegisterAllDialects.cpp
+++ b/mlir/lib/RegisterAllDialects.cpp
@@ -102,6 +102,7 @@
 #include "mlir/Interfaces/CastInterfaces.h"
 #include "mlir/Target/LLVM/NVVM/Target.h"
 #include "mlir/Target/LLVM/ROCDL/Target.h"
+#include "mlir/Target/LLVM/XeVM/Target.h"
 #include "mlir/Target/SPIRV/Target.h"
 
 /// Add all the MLIR dialects to the provided registry.
@@ -199,6 +200,7 @@ void mlir::registerAllDialects(DialectRegistry &registry) {
   NVVM::registerNVVMTargetInterfaceExternalModels(registry);
   ROCDL::registerROCDLTargetInterfaceExternalModels(registry);
   spirv::registerSPIRVTargetInterfaceExternalModels(registry);
+  xevm::registerXeVMTargetInterfaceExternalModels(registry);
 }
 
 /// Append all the MLIR dialects to the registry contained in the given context.
diff --git a/mlir/lib/RegisterAllExtensions.cpp b/mlir/lib/RegisterAllExtensions.cpp
index 8f7c67c..232ddaf 100644
--- a/mlir/lib/RegisterAllExtensions.cpp
+++ b/mlir/lib/RegisterAllExtensions.cpp
@@ -58,6 +58,7 @@
 #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
 
 /// This function may be called to register all MLIR dialect extensions with the
 /// provided registry.
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index f6e44c6..9a0e4d4 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -210,3 +210,27 @@ if(MLIR_ENABLE_ROCM_CONVERSIONS)
   )
 endif()
 
+# The SPIR-V LLVM components are only linked when the SPIRV backend is part
+# of the LLVM build; otherwise SPIRV_LIBS stays unset here and contributes
+# nothing to LINK_COMPONENTS below.
+if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(SPIRV_LIBS
+    SPIRVCodeGen
+    SPIRVDesc
+    SPIRVInfo
+  )
+endif()
+
+# Library implementing the `#xevm.target` compilation attribute.
+add_mlir_dialect_library(MLIRXeVMTarget
+  XeVM/Target.cpp
+
+  OBJECT
+
+  LINK_COMPONENTS
+  ${SPIRV_LIBS}
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRExecutionEngineUtils
+  MLIRSupport
+  MLIRGPUDialect
+  MLIRTargetLLVM
+  MLIRXeVMToLLVMIRTranslation
+)
diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
new file mode 100644
index 0000000..1e6784a2
--- /dev/null
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -0,0 +1,418 @@
+//===- Target.cpp - MLIR LLVM XeVM target compilation -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines XeVM target related functions including registration
+// calls for the `#xevm.target` compilation attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/XeVM/Target.h"
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"
+#include "mlir/IR/BuiltinDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/DialectResourceBlobManager.h"
+#include "mlir/Target/LLVM/XeVM/Utils.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Export.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/Config/Targets.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cstdint>
+#include <cstdlib>
+
+using namespace mlir;
+using namespace mlir::xevm;
+
+namespace {
+// XeVM implementation of the gpu::TargetAttrInterface.
+class XeVMTargetAttrImpl
+    : public gpu::TargetAttrInterface::FallbackModel<XeVMTargetAttrImpl> {
+public:
+  // Serializes `module` for the XeVM target described by `attribute`.
+  // Returns std::nullopt on failure.
+  std::optional<SmallVector<char, 0>>
+  serializeToObject(Attribute attribute, Operation *module,
+                    const gpu::TargetOptions &options) const;
+
+  // Wraps the serialized bytes in `object` into a gpu object attribute
+  // tagged with the compilation target taken from `options`.
+  Attribute createObject(Attribute attribute, Operation *module,
+                         const SmallVector<char, 0> &object,
+                         const gpu::TargetOptions &options) const;
+};
+} // namespace
+
+// Adds to `registry` an extension for the XeVM dialect that attaches
+// XeVMTargetAttrImpl as an interface implementation of XeVMTargetAttr.
+void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, XeVMDialect *dialect) {
+    XeVMTargetAttr::attachInterface<XeVMTargetAttrImpl>(*ctx);
+  });
+}
+
+// Convenience overload: performs the registration on a fresh registry and
+// appends that registry to `context`.
+void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
+    MLIRContext &context) {
+  DialectRegistry registry;
+  registerXeVMTargetInterfaceExternalModels(registry);
+  context.appendDialectRegistry(registry);
+}
+
+// Initializes the serializer for `module`. The base ModuleToObject receives
+// the target's triple and its `O` value; the list of libraries to link is
+// seeded from `targetOptions` and then extended with the link files listed
+// on the target attribute, if any.
+SerializeGPUModuleBase::SerializeGPUModuleBase(
+    Operation &module, XeVMTargetAttr xeTarget,
+    const gpu::TargetOptions &targetOptions)
+    : ModuleToObject(module, xeTarget.getTriple(), "", {}, xeTarget.getO()),
+      xeTarget(xeTarget), librariesToLink(targetOptions.getLibrariesToLink()),
+      targetOptions(targetOptions) {
+  if (xeTarget.getLinkFiles())
+    librariesToLink.append(xeTarget.getLinkFiles().begin(),
+                           xeTarget.getLinkFiles().end());
+}
+
+// Returns the XeVM target attribute this serializer was constructed with.
+XeVMTargetAttr SerializeGPUModuleBase::getTarget() const { return xeTarget; }
+
+// Loads every library in `librariesToLink` as an LLVM module in the context
+// of `module`. Yields an empty list when there is nothing to link and
+// std::nullopt when loading fails.
+std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
+  SmallVector<std::unique_ptr<llvm::Module>> loadedModules;
+  // With no libraries requested the loader is never invoked.
+  if (!librariesToLink.empty() &&
+      failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink,
+                                      loadedModules)))
+    return std::nullopt;
+  return std::move(loadedModules);
+}
+
+// Returns the operation being serialized viewed as a gpu.module. Because a
+// dyn_cast is used, the result is a null op when the operation is of another
+// kind.
+gpu::GPUModuleOp SerializeGPUModuleBase::getGPUModuleOp() {
+  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
+}
+
+// There is 1 way to finalize IL to native code: IGC
+// There are 2 ways to access IGC: AOT (ocloc) and JIT (L0 runtime).
+// - L0 runtime consumes IL and is external to MLIR codebase (rt wrappers).
+// - `ocloc` tool can be "queried" from within MLIR.
+//
+// Compiles `asmStr` with the `ocloc` tool and returns the bytes of the binary
+// it produces. `inputFormat` is passed to `ocloc` verbatim, right after the
+// input file name. The input, the output and the tool log go through
+// temporary files. Returns std::nullopt, after emitting a diagnostic, when
+// the tool cannot be found, a temporary file cannot be created or written,
+// the tool invocation fails, or the output cannot be read back.
+std::optional<SmallVector<char, 0>>
+SerializeGPUModuleBase::compileToBinary(const std::string &asmStr,
+                                        StringRef inputFormat) {
+  using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;
+  // Find the `ocloc` tool.
+  std::optional<std::string> oclocCompiler = findTool("ocloc");
+  if (!oclocCompiler)
+    return std::nullopt;
+  Location loc = getGPUModuleOp().getLoc();
+  std::string basename = llvm::formatv(
+      "mlir-{0}-{1}-{2}", getGPUModuleOp().getNameAttr().getValue(),
+      getTarget().getTriple(), getTarget().getChip());
+
+  // Creates a temporary file and pairs its path with a remover that deletes
+  // the file once the pair goes out of scope.
+  auto createTemp = [&](StringRef name,
+                        StringRef suffix) -> std::optional<TmpFile> {
+    llvm::SmallString<128> filePath;
+    if (auto ec = llvm::sys::fs::createTemporaryFile(name, suffix, filePath)) {
+      getGPUModuleOp().emitError()
+          << "Couldn't create the temp file: `" << filePath
+          << "`, error message: " << ec.message();
+      return std::nullopt;
+    }
+    return TmpFile(filePath, llvm::FileRemover(filePath.c_str()));
+  };
+  // Create temp file
+  std::optional<TmpFile> asmFile = createTemp(basename, "asm");
+  std::optional<TmpFile> binFile = createTemp(basename, "");
+  std::optional<TmpFile> logFile = createTemp(basename, "log");
+  if (!logFile || !asmFile || !binFile)
+    return std::nullopt;
+  // Dump the assembly to a temp file
+  std::error_code ec;
+  {
+    llvm::raw_fd_ostream asmStream(asmFile->first, ec);
+    if (ec) {
+      emitError(loc) << "Couldn't open the file: `" << asmFile->first
+                     << "`, error message: " << ec.message();
+      return std::nullopt;
+    }
+    asmStream << asmStr;
+    if (asmStream.has_error()) {
+      emitError(loc) << "An error occurred while writing the assembly to: `"
+                     << asmFile->first << "`.";
+      return std::nullopt;
+    }
+    asmStream.flush();
+  }
+  // Set cmd options
+  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
+      targetOptions.tokenizeCmdOptions();
+  // Example: --gpu-module-to-binary="opts='opt1 opt2'"
+  const std::string cmdOptsStr = "\"" + llvm::join(cmdOpts.second, " ") + "\"";
+  SmallVector<StringRef, 12> oclocArgs(
+      {"ocloc", "compile", "-file", asmFile->first, inputFormat, "-device",
+       getTarget().getChip(), "-output", binFile->first, "-output_no_suffix",
+       "-options", cmdOptsStr});
+
+// Dump tool invocation commands.
+#define DEBUG_TYPE "serialize-to-binary"
+  LLVM_DEBUG({
+    llvm::dbgs() << "Tool invocation for module: "
+                 << getGPUModuleOp().getNameAttr() << "\n";
+    llvm::interleave(oclocArgs, llvm::dbgs(), " ");
+    llvm::dbgs() << "\n";
+  });
+#undef DEBUG_TYPE
+  // Helper function for printing tool error logs. When the launcher gave no
+  // message of its own, the content of the log file is reported instead.
+  std::string message;
+  auto emitLogError =
+      [&](StringRef toolName) -> std::optional<SmallVector<char, 0>> {
+    if (message.empty()) {
+      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> toolStderr =
+          llvm::MemoryBuffer::getFile(logFile->first);
+      if (toolStderr)
+        emitError(loc) << toolName << " invocation failed. Log:\n"
+                       << toolStderr->get()->getBuffer();
+      else
+        emitError(loc) << toolName << " invocation failed.";
+      return std::nullopt;
+    }
+    emitError(loc) << toolName
+                   << " invocation failed, error message: " << message;
+    return std::nullopt;
+  };
+  // The second and third entries both point at the log file.
+  std::optional<StringRef> redirects[] = {
+      std::nullopt,
+      logFile->first,
+      logFile->first,
+  };
+  // Invoke ocloc.
+  if (llvm::sys::ExecuteAndWait(oclocCompiler.value(), oclocArgs, std::nullopt,
+                                redirects, 0, 0, &message))
+    return emitLogError("`ocloc`");
+  // The binary is read back from the reserved path with `.bin` appended. The
+  // remover created in `createTemp` was given the suffix-less path, so the
+  // `.bin` file needs its own remover or it is left behind. The remover is
+  // declared before the buffer so that the buffer is released first.
+  binFile->first.append(".bin");
+  llvm::FileRemover binRemover(binFile->first.c_str());
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> binaryBuffer =
+      llvm::MemoryBuffer::getFile(binFile->first);
+  if (!binaryBuffer) {
+    emitError(loc) << "Couldn't open the file: `" << binFile->first
+                   << "`, error message: " << binaryBuffer.getError().message();
+    return std::nullopt;
+  }
+  StringRef bin = (*binaryBuffer)->getBuffer();
+  return SmallVector<char, 0>(bin.begin(), bin.end());
+}
+
+// Locates the executable `tool`. The `bin` directory of the toolkit path
+// from the target options is tried first, then the directories in PATH.
+// Returns the path found, or std::nullopt after emitting an error on the GPU
+// module when the tool is in neither place.
+std::optional<std::string> SerializeGPUModuleBase::findTool(StringRef tool) {
+  // 1. Check the toolkit path given in the command line.
+  StringRef pathRef = targetOptions.getToolkitPath();
+  SmallVector<char, 256> path;
+  if (!pathRef.empty()) {
+    path.insert(path.begin(), pathRef.begin(), pathRef.end());
+    llvm::sys::path::append(path, "bin", tool);
+    if (llvm::sys::fs::can_execute(path))
+      return StringRef(path.data(), path.size()).str();
+  }
+  // 2. Check PATH.
+  if (std::optional<std::string> toolPath =
+          llvm::sys::Process::FindInEnvPath("PATH", tool))
+    return *toolPath;
+
+  getGPUModuleOp().emitError()
+      << "Couldn't find the `" << tool
+      << "` binary. Please specify the toolkit "
+         "path via GpuModuleToBinaryPass or add the compiler to $PATH.";
+  return std::nullopt;
+}
+
+namespace {
+/// Serializer that turns the LLVM module of a GPU module into the object
+/// format selected by the compilation target in the target options.
+class SPIRVSerializer : public SerializeGPUModuleBase {
+public:
+  SPIRVSerializer(Operation &module, XeVMTargetAttr xeTarget,
+                  const gpu::TargetOptions &targetOptions)
+      : SerializeGPUModuleBase(module, xeTarget, targetOptions) {}
+
+  /// Initializes the LLVM SPIR-V backend; the work is done at most once.
+  static void init();
+
+  /// Serializes the LLVM module to an object format, depending on the
+  /// compilation target selected in target options.
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule) override;
+
+private:
+  /// Translates the LLVM module to SPIR-V binary using LLVM's
+  /// SPIR-V target.
+  std::optional<std::string>
+  translateToSPIRVBinary(llvm::Module &llvmModule,
+                         llvm::TargetMachine &targetMachine);
+};
+} // namespace
+
+// Initializes the SPIR-V target, target info, target MC and asm printer. A
+// function-local once-flag makes repeated calls run the initialization only
+// once. When LLVM_HAS_SPIRV_TARGET is not set the guarded body is empty.
+void SPIRVSerializer::init() {
+  static llvm::once_flag initializeBackendOnce;
+  llvm::call_once(initializeBackendOnce, []() {
+#if LLVM_HAS_SPIRV_TARGET
+    LLVMInitializeSPIRVTarget();
+    LLVMInitializeSPIRVTargetInfo();
+    LLVMInitializeSPIRVTargetMC();
+    LLVMInitializeSPIRVAsmPrinter();
+#endif
+  });
+}
+
+// Serializes `llvmModule` according to the compilation target in the target
+// options:
+//   - `offload`:  delegates to SerializeGPUModuleBase::moduleToObject.
+//   - `assembly`: the textual ISA from translateToISA, null-terminated.
+//   - otherwise:  the SPIR-V binary from translateToSPIRVBinary, whose size
+//                 must be a multiple of 4 bytes.
+// Returns std::nullopt after emitting an error on the GPU module on failure.
+std::optional<SmallVector<char, 0>>
+SPIRVSerializer::moduleToObject(llvm::Module &llvmModule) {
+#define DEBUG_TYPE "serialize-to-llvm"
+  LLVM_DEBUG({
+    llvm::dbgs() << "LLVM IR for module: " << getGPUModuleOp().getNameAttr()
+                 << "\n";
+    llvm::dbgs() << llvmModule << "\n";
+    llvm::dbgs().flush();
+  });
+#undef DEBUG_TYPE
+
+  // Return LLVM IR if the compilation target is `offload`.
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
+    return SerializeGPUModuleBase::moduleToObject(llvmModule);
+
+#if !LLVM_HAS_SPIRV_TARGET
+  getGPUModuleOp()->emitError("The `SPIRV` target was not built. Please enable "
+                              "it when building LLVM.");
+  return std::nullopt;
+#endif // LLVM_HAS_SPIRV_TARGET
+
+  std::optional<llvm::TargetMachine *> targetMachine =
+      getOrCreateTargetMachine();
+  if (!targetMachine) {
+    getGPUModuleOp().emitError() << "Target Machine unavailable for triple "
+                                 << triple << ", can't optimize with LLVM\n";
+    return std::nullopt;
+  }
+
+  // Return SPIRV if the compilation target is `assembly`.
+  if (targetOptions.getCompilationTarget() ==
+      gpu::CompilationTarget::Assembly) {
+    std::optional<std::string> serializedISA =
+        translateToISA(llvmModule, **targetMachine);
+    if (!serializedISA) {
+      // Name the triple explicitly: the text used to read "...to ISA."
+      // immediately followed by the triple, with no separator.
+      getGPUModuleOp().emitError()
+          << "Failed translating the module to ISA for triple " << triple
+          << ", can't compile with LLVM\n";
+      return std::nullopt;
+    }
+
+#define DEBUG_TYPE "serialize-to-isa"
+    LLVM_DEBUG({
+      llvm::dbgs() << "SPIR-V for module: " << getGPUModuleOp().getNameAttr()
+                   << "\n";
+      llvm::dbgs() << *serializedISA << "\n";
+      llvm::dbgs().flush();
+    });
+#undef DEBUG_TYPE
+
+    // Make sure to include the null terminator.
+    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+    return SmallVector<char, 0>(bin.begin(), bin.end());
+  }
+
+  // Level zero runtime is set up to accept SPIR-V binary
+  // translateToSPIRVBinary translates the LLVM module to SPIR-V binary
+  // using LLVM's SPIRV target.
+  // compileToBinary can be used in the future if level zero runtime
+  // implementation switches to native XeVM binary format.
+  std::optional<std::string> serializedSPIRVBinary =
+      translateToSPIRVBinary(llvmModule, **targetMachine);
+  if (!serializedSPIRVBinary) {
+    getGPUModuleOp().emitError() << "Failed translating the module to Binary.";
+    return std::nullopt;
+  }
+  if (serializedSPIRVBinary->size() % 4) {
+    getGPUModuleOp().emitError() << "SPIRV code size must be a multiple of 4.";
+    return std::nullopt;
+  }
+  StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
+  return SmallVector<char, 0>(bin.begin(), bin.end());
+}
+
+// Runs the code generation passes of `targetMachine` on `llvmModule` with
+// the object-file output type and returns the emitted bytes as a string.
+// Returns std::nullopt when the passes cannot be set up.
+std::optional<std::string>
+SPIRVSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
+                                        llvm::TargetMachine &targetMachine) {
+  std::string targetISA;
+  llvm::raw_string_ostream stream(targetISA);
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegenPasses;
+    // A true result from addPassesToEmitFile is treated as failure here.
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CodeGenFileType::ObjectFile))
+      return std::nullopt;
+
+    codegenPasses.run(llvmModule);
+  }
+  return targetISA;
+}
+
+// Serializes the gpu.module `module` for the XeVM target `attribute`.
+// Only triples starting with "spirv" are supported. Returns std::nullopt
+// when `module` is null (silently), and after emitting an error when it is
+// not a gpu.module, when the triple is unsupported, or when LLVM was built
+// without the SPIR-V target.
+std::optional<SmallVector<char, 0>>
+XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
+                                      const gpu::TargetOptions &options) const {
+  if (!module)
+    return std::nullopt;
+  auto gpuMod = dyn_cast<gpu::GPUModuleOp>(module);
+  if (!gpuMod) {
+    module->emitError("expected to be a gpu.module op");
+    return std::nullopt;
+  }
+  auto xeTarget = cast<XeVMTargetAttr>(attribute);
+  if (xeTarget.getTriple().starts_with("spirv")) {
+#if !LLVM_HAS_SPIRV_TARGET
+    // Fail before touching the IR or running the serializer: an error has
+    // been reported, so producing an object afterwards would be misleading.
+    module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV "
+                      "without having the target built.");
+    return std::nullopt;
+#endif
+
+    // NOTE(review): the walk is interrupted after the first kernel function,
+    // so only that one gets the sub-group size of 16 -- confirm intended.
+    gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
+      if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
+        funcOp.setIntelReqdSubGroupSize(16);
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+
+    SPIRVSerializer serializer(*module, xeTarget, options);
+    serializer.init();
+
+    return serializer.run();
+  }
+  module->emitError("Unsupported XeVM target triple: ") << xeTarget.getTriple();
+  return std::nullopt;
+}
+
+// Builds the gpu object attribute for the serialized bytes in `object`. The
+// object is tagged with the compilation target from `options`; for the
+// `assembly` target its properties also record the target's `O` value.
+Attribute
+XeVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
+                                 const SmallVector<char, 0> &object,
+                                 const gpu::TargetOptions &options) const {
+  Builder builder(attribute.getContext());
+  const gpu::CompilationTarget format = options.getCompilationTarget();
+  auto xeTarget = cast<XeVMTargetAttr>(attribute);
+
+  // The properties dictionary stays null unless there is something to store.
+  DictionaryAttr objectProps;
+  if (format == gpu::CompilationTarget::Assembly) {
+    NamedAttribute optLevel =
+        builder.getNamedAttr("O", builder.getI32IntegerAttr(xeTarget.getO()));
+    objectProps = builder.getDictionaryAttr({optLevel});
+  }
+
+  StringAttr payload =
+      builder.getStringAttr(StringRef(object.data(), object.size()));
+  return builder.getAttr<gpu::ObjectAttr>(attribute, format, payload,
+                                          objectProps, /*kernels=*/nullptr);
+}
diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index 7c007de..7fc7795 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
+#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/Target/SPIRV/SPIRVBinaryUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Sequence.h"
@@ -112,7 +113,9 @@ LogicalResult Serializer::serialize() {
 
   // TODO: handle the other sections
   processCapability();
-  processExtension();
+  if (failed(processExtension())) {
+    return failure();
+  }
   processMemoryModel();
   processDebugInfo();
 
@@ -204,13 +207,24 @@ void Serializer::processDebugInfo() {
   // TODO: Encode more debug instructions.
 }
 
-void Serializer::processExtension() {
+LogicalResult Serializer::processExtension() {
   llvm::SmallVector<uint32_t, 16> extName;
-  for (spirv::Extension ext : module.getVceTriple()->getExtensions()) {
+  llvm::SmallSet<Extension, 4> deducedExts(
+      llvm::from_range, module.getVceTriple()->getExtensions());
+  auto nonSemanticInfoExt = spirv::Extension::SPV_KHR_non_semantic_info;
+  if (options.emitDebugInfo && !deducedExts.contains(nonSemanticInfoExt)) {
+    TargetEnvAttr targetEnvAttr = lookupTargetEnvOrDefault(module);
+    if (!is_contained(targetEnvAttr.getExtensions(), nonSemanticInfoExt))
+      return module.emitError(
+          "SPV_KHR_non_semantic_info extension not available");
+    deducedExts.insert(nonSemanticInfoExt);
+  }
+  for (spirv::Extension ext : deducedExts) {
     extName.clear();
     spirv::encodeStringLiteralInto(extName, spirv::stringifyExtension(ext));
     encodeInstructionInto(extensions, spirv::Opcode::OpExtension, extName);
   }
+  return success();
 }
 
 void Serializer::processMemoryModel() {
diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.h b/mlir/lib/Target/SPIRV/Serialization/Serializer.h
index 7047869..fb2cecd 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.h
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.h
@@ -102,7 +102,7 @@ private:
 
   void processDebugInfo();
 
-  void processExtension();
+  LogicalResult processExtension();
 
   void processMemoryModel();
 
diff --git a/mlir/lib/Target/SPIRV/TranslateRegistration.cpp b/mlir/lib/Target/SPIRV/TranslateRegistration.cpp
index 4391ee7..796354e 100644
--- a/mlir/lib/Target/SPIRV/TranslateRegistration.cpp
+++ b/mlir/lib/Target/SPIRV/TranslateRegistration.cpp
@@ -106,7 +106,7 @@ serializeModule(spirv::ModuleOp moduleOp, raw_ostream &output,
     int fd = 0;
 
     std::error_code errorCode = llvm::sys::fs::createUniqueFile(
-        options.validationFilePrefix + "%%%%%%", fd, filename);
+        options.validationFilePrefix + "%%%%%%.spv", fd, filename);
     if (errorCode)
       return moduleOp.emitError("error creating validation output file: ")
              << errorCode.message() << "\n";
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 2470f2b..001c13e 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -182,15 +182,24 @@ private:
 /// conversions.)
 static const StringRef kPureTypeConversionMarker = "__pure_type_conversion__";
 
+/// Return the single operation that defines every value in `values`, or
+/// nullptr if they do not share one defining op. `values` must not be empty.
+static Operation *getCommonDefiningOp(const ValueVector &values) {
+  assert(!values.empty() && "expected non-empty value vector");
+  Operation *op = values.front().getDefiningOp();
+  for (Value v : llvm::drop_begin(values)) {
+    if (v.getDefiningOp() != op)
+      return nullptr; // Defining ops differ.
+  }
+  return op;
+}
+
 /// A vector of values is a pure type conversion if all values are defined by
 /// the same operation and the operation has the `kPureTypeConversionMarker`
 /// attribute.
 static bool isPureTypeConversion(const ValueVector &values) {
   assert(!values.empty() && "expected non-empty value vector");
-  Operation *op = values.front().getDefiningOp();
-  for (Value v : llvm::drop_begin(values))
-    if (v.getDefiningOp() != op)
-      return false;
+  Operation *op = getCommonDefiningOp(values);
   return op && op->hasAttr(kPureTypeConversionMarker);
 }
 
@@ -841,7 +850,7 @@ namespace detail {
 struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   explicit ConversionPatternRewriterImpl(MLIRContext *ctx,
                                          const ConversionConfig &config)
-      : context(ctx), config(config) {}
+      : context(ctx), config(config), notifyingRewriter(ctx, config.listener) {}
 
   //===--------------------------------------------------------------------===//
   // State Management
@@ -863,6 +872,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// failure.
   template <typename RewriteTy, typename... Args>
   void appendRewrite(Args &&...args) {
+    assert(config.allowPatternRollback && "appending rewrites is not allowed");
     rewrites.push_back(
         std::make_unique<RewriteTy>(*this, std::forward<Args>(args)...));
   }
@@ -889,15 +899,8 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   bool wasOpReplaced(Operation *op) const;
 
   /// Lookup the most recently mapped values with the desired types in the
-  /// mapping.
-  ///
-  /// Special cases:
-  /// - If the desired type range is empty, simply return the most recently
-  ///   mapped values.
-  /// - If there is no mapping to the desired types, also return the most
-  ///   recently mapped values.
-  /// - If there is no mapping for the given values at all, return the given
-  ///   value.
+  /// mapping, taking into account only replacements. Perform a best-effort
+  /// search for existing materializations with the desired types.
   ///
   /// If `skipPureTypeConversions` is "true", materializations that are pure
   /// type conversions are not considered.
@@ -1066,6 +1069,9 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   ConversionValueMapping mapping;
 
   /// Ordered list of block operations (creations, splits, motions).
+  /// This vector is maintained only if `allowPatternRollback` is set to
+  /// "true". Otherwise, all IR rewrites are materialized immediately and no
+  /// bookkeeping is needed.
   SmallVector<std::unique_ptr<IRRewrite>> rewrites;
 
   /// A set of operations that should no longer be considered for legalization.
@@ -1089,6 +1095,10 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// by the current pattern.
   SetVector<Block *> patternInsertedBlocks;
 
+  /// A set of unresolved materializations that were created by the current
+  /// pattern.
+  DenseSet<UnrealizedConversionCastOp> patternMaterializations;
+
   /// A mapping for looking up metadata of unresolved materializations.
   DenseMap<UnrealizedConversionCastOp, UnresolvedMaterializationInfo>
       unresolvedMaterializations;
@@ -1104,6 +1114,23 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// Dialect conversion configuration.
   const ConversionConfig &config;
 
+  /// A set of erased operations. This set is utilized only if
+  /// `allowPatternRollback` is set to "false". Conceptually, this set is
+  /// similar to `replacedOps` (which is maintained when the flag is set to
+  /// "true"). However, erasing from a DenseSet is more efficient than erasing
+  /// from a SetVector.
+  DenseSet<Operation *> erasedOps;
+
+  /// A set of erased blocks. This set is utilized only if
+  /// `allowPatternRollback` is set to "false".
+  DenseSet<Block *> erasedBlocks;
+
+  /// A rewriter that notifies the listener (if any) about all IR
+  /// modifications. This rewriter is utilized only if `allowPatternRollback`
+  /// is set to "false". If the flag is set to "true", the listener is notified
+  /// with a separate mechanism (e.g., in `IRRewrite::commit`).
+  IRRewriter notifyingRewriter;
+
 #ifndef NDEBUG
   /// A set of operations that have pending updates. This tracking isn't
   /// strictly necessary, and is thus only active during debug builds for extra
@@ -1140,11 +1167,8 @@ void BlockTypeConversionRewrite::rollback() {
   getNewBlock()->replaceAllUsesWith(getOrigBlock());
 }
 
-void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) {
-  Value repl = rewriterImpl.findOrBuildReplacementValue(arg, converter);
-  if (!repl)
-    return;
-
+static void performReplaceBlockArg(RewriterBase &rewriter, BlockArgument arg,
+                                   Value repl) {
   if (isa<BlockArgument>(repl)) {
     rewriter.replaceAllUsesWith(arg, repl);
     return;
@@ -1161,6 +1185,13 @@ void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) {
   });
 }
 
+void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) {
+  Value repl = rewriterImpl.findOrBuildReplacementValue(arg, converter);
+  if (!repl)
+    return; // No replacement value is available: leave uses of `arg` as is.
+  performReplaceBlockArg(rewriter, arg, repl);
+}
+
 void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase({arg}); }
 
 void ReplaceOperationRewrite::commit(RewriterBase &rewriter) {
@@ -1246,6 +1277,30 @@ void ConversionPatternRewriterImpl::applyRewrites() {
 
 ValueVector ConversionPatternRewriterImpl::lookupOrDefault(
     Value from, TypeRange desiredTypes, bool skipPureTypeConversions) const {
+  // Helper function that looks up a value vector as a whole (one lookup step).
+  auto lookup = [&](const ValueVector &values) -> ValueVector {
+    assert(!values.empty() && "expected non-empty value vector");
+
+    // If pattern rollback is enabled, use the mapping to look up the
+    // values.
+    if (config.allowPatternRollback)
+      return mapping.lookup(values);
+
+    // Otherwise, look up values by examining the IR. All replacements have
+    // already been materialized in IR.
+    Operation *op = getCommonDefiningOp(values);
+    if (!op)
+      return {};
+    auto castOp = dyn_cast<UnrealizedConversionCastOp>(op);
+    if (!castOp)
+      return {};
+    if (!this->unresolvedMaterializations.contains(castOp))
+      return {};
+    if (castOp.getOutputs() != values)
+      return {};
+    return castOp.getInputs();
+  };
+
   // Helper function that looks up each value in `values` individually and then
   // composes the results. If that fails, it tries to look up the entire vector
   // at once.
@@ -1253,7 +1308,7 @@ ValueVector ConversionPatternRewriterImpl::lookupOrDefault(
     // If possible, replace each value with (one or multiple) mapped values.
     ValueVector next;
     for (Value v : values) {
-      ValueVector r = mapping.lookup({v});
+      ValueVector r = lookup({v});
       if (!r.empty()) {
         llvm::append_range(next, r);
       } else {
@@ -1273,7 +1328,7 @@ ValueVector ConversionPatternRewriterImpl::lookupOrDefault(
     // be stored (and looked up) in the mapping. But for performance reasons,
     // we choose to reuse existing IR (when possible) instead of creating it
     // multiple times.
-    ValueVector r = mapping.lookup(values);
+    ValueVector r = lookup(values);
     if (r.empty()) {
       // No mapping found: The lookup stops here.
       return {};
@@ -1347,15 +1402,8 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state,
 void ConversionPatternRewriterImpl::undoRewrites(unsigned numRewritesToKeep,
                                                  StringRef patternName) {
   for (auto &rewrite :
-       llvm::reverse(llvm::drop_begin(rewrites, numRewritesToKeep))) {
-    if (!config.allowPatternRollback &&
-        !isa<UnresolvedMaterializationRewrite>(rewrite)) {
-      // Unresolved materializations can always be rolled back (erased).
-      llvm::report_fatal_error("pattern '" + patternName +
-                               "' rollback of IR modifications requested");
-    }
+       llvm::reverse(llvm::drop_begin(rewrites, numRewritesToKeep)))
     rewrite->rollback();
-  }
   rewrites.resize(numRewritesToKeep);
 }
 
@@ -1419,12 +1467,12 @@ LogicalResult ConversionPatternRewriterImpl::remapValues(
 
 bool ConversionPatternRewriterImpl::isOpIgnored(Operation *op) const {
   // Check to see if this operation is ignored or was replaced.
-  return replacedOps.count(op) || ignoredOps.count(op);
+  return wasOpReplaced(op) || ignoredOps.count(op);
 }
 
 bool ConversionPatternRewriterImpl::wasOpReplaced(Operation *op) const {
   // Check to see if this operation was replaced.
-  return replacedOps.count(op);
+  return replacedOps.count(op) || erasedOps.count(op);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1508,7 +1556,8 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
   // a bit more efficient, so we try to do that when possible.
   bool fastPath = !config.listener;
   if (fastPath) {
-    appendRewrite<InlineBlockRewrite>(newBlock, block, newBlock->end());
+    if (config.allowPatternRollback)
+      appendRewrite<InlineBlockRewrite>(newBlock, block, newBlock->end());
     newBlock->getOperations().splice(newBlock->end(), block->getOperations());
   } else {
     while (!block->empty())
@@ -1556,7 +1605,8 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
     replaceUsesOfBlockArgument(origArg, replArgs, converter);
   }
 
-  appendRewrite<BlockTypeConversionRewrite>(/*origBlock=*/block, newBlock);
+  if (config.allowPatternRollback)
+    appendRewrite<BlockTypeConversionRewrite>(/*origBlock=*/block, newBlock);
 
   // Erase the old block. (It is just unlinked for now and will be erased during
   // cleanup.)
@@ -1585,23 +1635,32 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization(
   // tracking the materialization like we do for other operations.
   OpBuilder builder(outputTypes.front().getContext());
   builder.setInsertionPoint(ip.getBlock(), ip.getPoint());
-  auto convertOp =
+  UnrealizedConversionCastOp convertOp =
       UnrealizedConversionCastOp::create(builder, loc, outputTypes, inputs);
   if (isPureTypeConversion)
     convertOp->setAttr(kPureTypeConversionMarker, builder.getUnitAttr());
-  if (!valuesToMap.empty())
-    mapping.map(valuesToMap, convertOp.getResults());
+
+  // Register the materialization.
   if (castOp)
     *castOp = convertOp;
   unresolvedMaterializations[convertOp] =
       UnresolvedMaterializationInfo(converter, kind, originalType);
-  appendRewrite<UnresolvedMaterializationRewrite>(convertOp,
-                                                  std::move(valuesToMap));
+  if (config.allowPatternRollback) {
+    if (!valuesToMap.empty())
+      mapping.map(valuesToMap, convertOp.getResults());
+    appendRewrite<UnresolvedMaterializationRewrite>(convertOp,
+                                                    std::move(valuesToMap));
+  } else {
+    patternMaterializations.insert(convertOp);
+  }
   return convertOp.getResults();
 }
 
 Value ConversionPatternRewriterImpl::findOrBuildReplacementValue(
     Value value, const TypeConverter *converter) {
+  assert(config.allowPatternRollback &&
+         "this code path is valid only in rollback mode");
+
   // Try to find a replacement value with the same type in the conversion value
   // mapping. This includes cached materializations. We try to reuse those
   // instead of generating duplicate IR.
@@ -1663,26 +1722,119 @@ void ConversionPatternRewriterImpl::notifyOperationInserted(
       logger.getOStream() << " (was detached)";
     logger.getOStream() << "\n";
   });
-  assert(!wasOpReplaced(op->getParentOp()) &&
+
+  // In rollback mode, it is easier to misuse the API, so perform extra error
+  // checking.
+  assert(!(config.allowPatternRollback && wasOpReplaced(op->getParentOp())) &&
          "attempting to insert into a block within a replaced/erased op");
 
+  // In "no rollback" mode, the listener is always notified immediately.
+  if (!config.allowPatternRollback && config.listener)
+    config.listener->notifyOperationInserted(op, previous);
+
   if (wasDetached) {
-    // If the op was detached, it is most likely a newly created op.
-    // TODO: If the same op is inserted multiple times from a detached state,
-    // the rollback mechanism may erase the same op multiple times. This is a
-    // bug in the rollback-based dialect conversion driver.
-    appendRewrite<CreateOperationRewrite>(op);
+    // If the op was detached, it is most likely a newly created op. Add it to
+    // the set of newly created ops, so that it will be legalized. If this op
+    // is not a newly created op, it will be legalized a second time, which is
+    // inefficient but harmless.
     patternNewOps.insert(op);
+
+    if (config.allowPatternRollback) {
+      // TODO: If the same op is inserted multiple times from a detached
+      // state, the rollback mechanism may erase the same op multiple times.
+      // This is a bug in the rollback-based dialect conversion driver.
+      appendRewrite<CreateOperationRewrite>(op);
+    } else {
+      // In "no rollback" mode, there is an extra data structure for tracking
+      // erased operations that must be kept up to date.
+      erasedOps.erase(op);
+    }
     return;
   }
 
   // The op was moved from one place to another.
-  appendRewrite<MoveOperationRewrite>(op, previous);
+  if (config.allowPatternRollback)
+    appendRewrite<MoveOperationRewrite>(op, previous);
+}
+
+/// Given that `fromRange` is about to be replaced with `toRange`, compute one
+/// replacement per value, typed like `fromRange` (null if the value is unused).
+static SmallVector<Value>
+getReplacementValues(ConversionPatternRewriterImpl &impl, ValueRange fromRange,
+                     const SmallVector<SmallVector<Value>> &toRange,
+                     const TypeConverter *converter) {
+  assert(!impl.config.allowPatternRollback &&
+         "this code path is valid only in 'no rollback' mode");
+  SmallVector<Value> repls;
+  for (auto [from, to] : llvm::zip_equal(fromRange, toRange)) {
+    if (from.use_empty()) {
+      // The replaced value has no uses. Record a null Value as a placeholder.
+      repls.push_back(Value());
+      continue;
+    }
+
+    if (to.empty()) {
+      // The replaced value is dropped. Materialize a replacement value "out of
+      // thin air".
+      Value srcMat = impl.buildUnresolvedMaterialization(
+          MaterializationKind::Source, computeInsertPoint(from), from.getLoc(),
+          /*valuesToMap=*/{}, /*inputs=*/ValueRange(),
+          /*outputTypes=*/from.getType(), /*originalType=*/Type(),
+          converter)[0];
+      repls.push_back(srcMat);
+      continue;
+    }
+
+    if (TypeRange(ValueRange(to)) == TypeRange(from.getType())) {
+      // `to` is a single value of the original type. Use it directly.
+      repls.push_back(to[0]);
+      continue;
+    }
+
+    // `to` has a different type or consists of several values. Build a source
+    // materialization to the original type.
+    // TODO: This is a bit inefficient. We should try to reuse existing
+    // materializations if possible. This would require an extension of the
+    // `lookupOrDefault` API.
+    Value srcMat = impl.buildUnresolvedMaterialization(
+        MaterializationKind::Source, computeInsertPoint(to), from.getLoc(),
+        /*valuesToMap=*/{}, /*inputs=*/to, /*outputTypes=*/from.getType(),
+        /*originalType=*/Type(), converter)[0];
+    repls.push_back(srcMat);
+  }
+
+  return repls;
+}
 
 void ConversionPatternRewriterImpl::replaceOp(
     Operation *op, SmallVector<SmallVector<Value>> &&newValues) {
-  assert(newValues.size() == op->getNumResults());
+  assert(newValues.size() == op->getNumResults() &&
+         "incorrect number of replacement values");
+
+  if (!config.allowPatternRollback) {
+    // Pattern rollback is not allowed: materialize all IR changes immediately.
+    SmallVector<Value> repls = getReplacementValues(
+        *this, op->getResults(), newValues, currentTypeConverter);
+    // Update internal data structures, so that there are no dangling pointers
+    // to erased IR.
+    op->walk([&](Operation *op) {
+      erasedOps.insert(op);
+      ignoredOps.remove(op);
+      if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op)) {
+        unresolvedMaterializations.erase(castOp);
+        patternMaterializations.erase(castOp);
+      }
+      // The original op will be erased, so remove it from the set of
+      // unlegalized ops.
+      if (config.unlegalizedOps)
+        config.unlegalizedOps->erase(op);
+    });
+    op->walk([&](Block *block) { erasedBlocks.insert(block); });
+    // Replace the op with the replacement values and notify the listener.
+    notifyingRewriter.replaceOp(op, repls);
+    return;
+  }
+
   assert(!ignoredOps.contains(op) && "operation was already replaced");
 
   // Check if replaced op is an unresolved materialization, i.e., an
@@ -1722,11 +1874,46 @@ void ConversionPatternRewriterImpl::replaceOp(
 
 void ConversionPatternRewriterImpl::replaceUsesOfBlockArgument(
     BlockArgument from, ValueRange to, const TypeConverter *converter) {
+  if (!config.allowPatternRollback) {
+    SmallVector<Value> toConv = llvm::to_vector(to);
+    SmallVector<Value> repls =
+        getReplacementValues(*this, from, {toConv}, converter);
+    IRRewriter r(from.getContext());
+    Value repl = repls.front();
+    if (!repl)
+      return;
+
+    performReplaceBlockArg(r, from, repl);
+    return;
+  }
+
   appendRewrite<ReplaceBlockArgRewrite>(from.getOwner(), from, converter);
   mapping.map(from, to);
 }
 
 void ConversionPatternRewriterImpl::eraseBlock(Block *block) {
+  if (!config.allowPatternRollback) {
+    // Pattern rollback is not allowed: materialize all IR changes immediately.
+    // Update internal data structures, so that there are no dangling pointers
+    // to erased IR.
+    block->walk([&](Operation *op) {
+      erasedOps.insert(op);
+      ignoredOps.remove(op);
+      if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op)) {
+        unresolvedMaterializations.erase(castOp);
+        patternMaterializations.erase(castOp);
+      }
+      // The original op will be erased, so remove it from the set of
+      // unlegalized ops.
+      if (config.unlegalizedOps)
+        config.unlegalizedOps->erase(op);
+    });
+    block->walk([&](Block *block) { erasedBlocks.insert(block); });
+    // Erase the block and notify the listener.
+    notifyingRewriter.eraseBlock(block);
+    return;
+  }
+
   assert(!wasOpReplaced(block->getParentOp()) &&
          "attempting to erase a block within a replaced/erased op");
   appendRewrite<EraseBlockRewrite>(block);
@@ -1760,23 +1947,37 @@ void ConversionPatternRewriterImpl::notifyBlockInserted(
           logger.getOStream() << " (was detached)";
         logger.getOStream() << "\n";
       });
-  assert(!wasOpReplaced(newParentOp) &&
+
+  // In rollback mode, it is easier to misuse the API, so perform extra error
+  // checking.
+  assert(!(config.allowPatternRollback && wasOpReplaced(newParentOp)) &&
          "attempting to insert into a region within a replaced/erased op");
   (void)newParentOp;
 
+  // In "no rollback" mode, the listener is always notified immediately.
+  if (!config.allowPatternRollback && config.listener)
+    config.listener->notifyBlockInserted(block, previous, previousIt);
+
   patternInsertedBlocks.insert(block);
 
   if (wasDetached) {
     // If the block was detached, it is most likely a newly created block.
-    // TODO: If the same block is inserted multiple times from a detached state,
-    // the rollback mechanism may erase the same block multiple times. This is a
-    // bug in the rollback-based dialect conversion driver.
-    appendRewrite<CreateBlockRewrite>(block);
+    if (config.allowPatternRollback) {
+      // TODO: If the same block is inserted multiple times from a detached
+      // state, the rollback mechanism may erase the same block multiple times.
+      // This is a bug in the rollback-based dialect conversion driver.
+      appendRewrite<CreateBlockRewrite>(block);
+    } else {
+      // In "no rollback" mode, there is an extra data structure for tracking
+      // erased blocks that must be kept up to date.
+      erasedBlocks.erase(block);
+    }
     return;
   }
 
   // The block was moved from one place to another.
-  appendRewrite<MoveBlockRewrite>(block, previous, previousIt);
+  if (config.allowPatternRollback)
+    appendRewrite<MoveBlockRewrite>(block, previous, previousIt);
 }
 
 void ConversionPatternRewriterImpl::inlineBlockBefore(Block *source,
@@ -1956,7 +2157,7 @@ void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest,
   // a bit more efficient, so we try to do that when possible.
   bool fastPath = !getConfig().listener;
 
-  if (fastPath)
+  if (fastPath && impl->config.allowPatternRollback)
     impl->inlineBlockBefore(source, dest, before);
 
   // Replace all uses of block arguments.
@@ -1982,6 +2183,11 @@ void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest,
 }
 
 void ConversionPatternRewriter::startOpModification(Operation *op) {
+  if (!impl->config.allowPatternRollback) {
+    // Pattern rollback is not allowed: no extra bookkeeping is needed.
+    PatternRewriter::startOpModification(op);
+    return;
+  }
   assert(!impl->wasOpReplaced(op) &&
          "attempting to modify a replaced/erased op");
 #ifndef NDEBUG
@@ -1991,20 +2197,29 @@ void ConversionPatternRewriter::startOpModification(Operation *op) {
 }
 
 void ConversionPatternRewriter::finalizeOpModification(Operation *op) {
-  assert(!impl->wasOpReplaced(op) &&
-         "attempting to modify a replaced/erased op");
-  PatternRewriter::finalizeOpModification(op);
   impl->patternModifiedOps.insert(op);
+  if (!impl->config.allowPatternRollback) {
+    PatternRewriter::finalizeOpModification(op);
+    if (getConfig().listener)
+      getConfig().listener->notifyOperationModified(op);
+    return;
+  }
 
   // There is nothing to do here, we only need to track the operation at the
   // start of the update.
 #ifndef NDEBUG
+  assert(!impl->wasOpReplaced(op) &&
+         "attempting to modify a replaced/erased op");
   assert(impl->pendingRootUpdates.erase(op) &&
          "operation did not have a pending in-place update");
 #endif
 }
 
 void ConversionPatternRewriter::cancelOpModification(Operation *op) {
+  if (!impl->config.allowPatternRollback) {
+    PatternRewriter::cancelOpModification(op);
+    return;
+  }
 #ifndef NDEBUG
   assert(impl->pendingRootUpdates.erase(op) &&
          "operation did not have a pending in-place update");
@@ -2439,17 +2654,23 @@ OperationLegalizer::legalizeWithPattern(Operation *op,
   RewriterState curState = rewriterImpl.getCurrentState();
   auto onFailure = [&](const Pattern &pattern) {
     assert(rewriterImpl.pendingRootUpdates.empty() && "dangling root updates");
-#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
     if (!rewriterImpl.config.allowPatternRollback) {
-      // Returning "failure" after modifying IR is not allowed.
+      // Erase all unresolved materializations.
+      for (auto op : rewriterImpl.patternMaterializations) {
+        rewriterImpl.unresolvedMaterializations.erase(op);
+        op.erase();
+      }
+      rewriterImpl.patternMaterializations.clear();
+#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+      // Expensive pattern check that can detect API violations.
       if (checkOp) {
         OperationFingerPrint fingerPrintAfterPattern(checkOp);
         if (fingerPrintAfterPattern != *topLevelFingerPrint)
           llvm::report_fatal_error("pattern '" + pattern.getDebugName() +
                                    "' returned failure but IR did change");
       }
-    }
 #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+    }
     rewriterImpl.patternNewOps.clear();
     rewriterImpl.patternModifiedOps.clear();
     rewriterImpl.patternInsertedBlocks.clear();
@@ -2473,6 +2694,16 @@ OperationLegalizer::legalizeWithPattern(Operation *op,
   // successfully applied.
   auto onSuccess = [&](const Pattern &pattern) {
     assert(rewriterImpl.pendingRootUpdates.empty() && "dangling root updates");
+    if (!rewriterImpl.config.allowPatternRollback) {
+      // Eagerly erase unused materializations.
+      for (auto op : rewriterImpl.patternMaterializations) {
+        if (op->use_empty()) {
+          rewriterImpl.unresolvedMaterializations.erase(op);
+          op.erase();
+        }
+      }
+      rewriterImpl.patternMaterializations.clear();
+    }
     SetVector<Operation *> newOps = moveAndReset(rewriterImpl.patternNewOps);
     SetVector<Operation *> modifiedOps =
         moveAndReset(rewriterImpl.patternModifiedOps);
@@ -2563,6 +2794,9 @@ LogicalResult OperationLegalizer::legalizePatternBlockRewrites(
   // If the pattern moved or created any blocks, make sure the types of block
   // arguments get legalized.
   for (Block *block : insertedBlocks) {
+    if (impl.erasedBlocks.contains(block))
+      continue;
+
     // Only check blocks outside of the current operation.
     Operation *parentOp = block->getParentOp();
     if (!parentOp || parentOp == op || block->getNumArguments() == 0)
diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
index 83bdbe1..ba12ff2 100644
--- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
+++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir
@@ -3,6 +3,7 @@
 // Same below, but using the `ConvertToLLVMPatternInterface` entry point
 // and the generic `convert-to-llvm` pass.
 // RUN: mlir-opt --convert-to-llvm="filter-dialects=arith" --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="filter-dialects=arith allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @vector_ops
 func.func @vector_ops(%arg0: vector<4xf32>, %arg1: vector<4xi1>, %arg2: vector<4xi64>, %arg3: vector<4xi64>) -> vector<4xf32> {
@@ -373,12 +374,11 @@ func.func @integer_extension_and_truncation(%arg0 : i3) {
 
 // CHECK-LABEL: @integer_cast_0d_vector
 func.func @integer_cast_0d_vector(%arg0 : vector<i3>) {
-// CHECK: %[[ARG0:.*]] = builtin.unrealized_conversion_cast
-// CHECK-NEXT: = llvm.sext %[[ARG0]] : vector<1xi3> to vector<1xi6>
+// CHECK: = llvm.sext %{{.*}}: vector<1xi3> to vector<1xi6>
   %0 = arith.extsi %arg0 : vector<i3> to vector<i6>
-// CHECK-NEXT: = llvm.zext %[[ARG0]] : vector<1xi3> to vector<1xi6>
+// CHECK-NEXT: = llvm.zext %{{.*}} : vector<1xi3> to vector<1xi6>
   %1 = arith.extui %arg0 : vector<i3> to vector<i6>
-// CHECK-NEXT: = llvm.trunc %[[ARG0]] : vector<1xi3> to vector<1xi2>
+// CHECK-NEXT: = llvm.trunc %{{.*}} : vector<1xi3> to vector<1xi2>
   %2 = arith.trunci %arg0 : vector<i3> to vector<i2>
   return
 }
diff --git a/mlir/test/Conversion/ComplexToLLVM/convert-to-llvm.mlir b/mlir/test/Conversion/ComplexToLLVM/convert-to-llvm.mlir
index ad1b665..4d2c12a 100644
--- a/mlir/test/Conversion/ComplexToLLVM/convert-to-llvm.mlir
+++ b/mlir/test/Conversion/ComplexToLLVM/convert-to-llvm.mlir
@@ -3,6 +3,7 @@
 // Same below, but using the `ConvertToLLVMPatternInterface` entry point
 // and the generic `convert-to-llvm` pass.
 // RUN: mlir-opt --convert-to-llvm="filter-dialects=complex" --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="filter-dialects=complex allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: func @complex_create
 // CHECK-SAME:    (%[[REAL0:.*]]: f32, %[[IMAG0:.*]]: f32)
@@ -23,9 +24,9 @@ func.func @complex_constant() -> complex<f64> {
 
 // CHECK-LABEL: func @complex_extract
 // CHECK-SAME:    (%[[CPLX:.*]]: complex<f32>)
-// CHECK-NEXT:    %[[CAST0:.*]] = builtin.unrealized_conversion_cast %[[CPLX]] : complex<f32> to !llvm.struct<(f32, f32)>
-// CHECK-NEXT:    %[[REAL:.*]] = llvm.extractvalue %[[CAST0]][0] : !llvm.struct<(f32, f32)>
-// CHECK-NEXT:    %[[IMAG:.*]] = llvm.extractvalue %[[CAST0]][1] : !llvm.struct<(f32, f32)>
+// CHECK:    builtin.unrealized_conversion_cast %[[CPLX]] : complex<f32> to !llvm.struct<(f32, f32)>
+// CHECK:    %[[REAL:.*]] = llvm.extractvalue %{{.*}}[0] : !llvm.struct<(f32, f32)>
+// CHECK:    %[[IMAG:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(f32, f32)>
 func.func @complex_extract(%cplx: complex<f32>) {
   %real1 = complex.re %cplx : complex<f32>
   %imag1 = complex.im %cplx : complex<f32>
diff --git a/mlir/test/Conversion/ControlFlowToLLVM/assert.mlir b/mlir/test/Conversion/ControlFlowToLLVM/assert.mlir
index 3ec8f1f..18d0526 100644
--- a/mlir/test/Conversion/ControlFlowToLLVM/assert.mlir
+++ b/mlir/test/Conversion/ControlFlowToLLVM/assert.mlir
@@ -3,6 +3,7 @@
 // Same below, but using the `ConvertToLLVMPatternInterface` entry point
 // and the generic `convert-to-llvm` pass.
 // RUN: mlir-opt --convert-to-llvm="filter-dialects=cf" --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="filter-dialects=cf allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 
 func.func @main() {
   %a = arith.constant 0 : i1
diff --git a/mlir/test/Conversion/FuncToLLVM/func-to-llvm.mlir b/mlir/test/Conversion/FuncToLLVM/func-to-llvm.mlir
index 2113557..94dfcea 100644
--- a/mlir/test/Conversion/FuncToLLVM/func-to-llvm.mlir
+++ b/mlir/test/Conversion/FuncToLLVM/func-to-llvm.mlir
@@ -9,6 +9,7 @@
 // Same below, but using the `ConvertToLLVMPatternInterface` entry point
 // and the generic `convert-to-llvm` pass.
 // RUN: mlir-opt --convert-to-llvm="filter-dialects=arith,cf,func,math" %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="filter-dialects=arith,cf,func,math allow-pattern-rollback=0" %s | FileCheck %s
 
 // CHECK-LABEL: func @empty() {
 // CHECK-NEXT:  llvm.return
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-target-attr.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-target-attr.mlir
index ed7fa65..0016db5 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-target-attr.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-target-attr.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt %s --pass-pipeline="builtin.module(gpu.module(convert-to-llvm{dynamic=true}))" | FileCheck %s
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(gpu.module(convert-to-llvm{dynamic=true allow-pattern-rollback=0}))" | FileCheck %s
 
 // CHECK-LABEL: gpu.module @nvvm_module
 gpu.module @nvvm_module [#nvvm.target] {
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 2b6adff..fa4a974 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -54,8 +54,8 @@ gpu.module @test_module {
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
     %gDimZ = gpu.grid_dim z
 
-    // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} : (i32, i32) -> i32
-    // CHECK: = rocdl.mbcnt.hi %{{.*}}, %{{.*}} : (i32, i32) -> i32
+    // CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 32>}]} : (i32, i32) -> i32
+    // CHECK: = rocdl.mbcnt.hi %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 64>}]} : (i32, i32) -> i32
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
     %laneId = gpu.lane_id
 
@@ -701,7 +701,7 @@ gpu.module @test_module {
     // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
     // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
     // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
-    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 
+    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
     // *** UP mode shuffle ***
     // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
     // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
diff --git a/mlir/test/Conversion/IndexToLLVM/index-to-llvm.mlir b/mlir/test/Conversion/IndexToLLVM/index-to-llvm.mlir
index 26abb3b..007929e 100644
--- a/mlir/test/Conversion/IndexToLLVM/index-to-llvm.mlir
+++ b/mlir/test/Conversion/IndexToLLVM/index-to-llvm.mlir
@@ -5,6 +5,7 @@
 // Same below, but using the `ConvertToLLVMPatternInterface` entry point
 // and the generic `convert-to-llvm` pass.
 // RUN: mlir-opt --convert-to-llvm="filter-dialects=index" --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="filter-dialects=index allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @trivial_ops
 func.func @trivial_ops(%a: index, %b: index) {
diff --git a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir
index 9290408..f454122 100644
--- a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir
+++ b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir
@@ -3,6 +3,7 @@
 // Same below, but using the `ConvertToLLVMPatternInterface` entry point
 // and the generic `convert-to-llvm` pass.
 // RUN: mlir-opt --convert-to-llvm="filter-dialects=math" --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="filter-dialects=math allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @ops
 func.func @ops(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: i32, %arg4: f64) {
diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-alloc-copy.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-alloc-copy.mlir
new file mode 100644
index 0000000..c1627a0
--- /dev/null
+++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-alloc-copy.mlir
@@ -0,0 +1,55 @@
+// RUN: mlir-opt -convert-memref-to-emitc="lower-to-cpp=true" %s -split-input-file | FileCheck %s --check-prefixes=CHECK,CPP
+// RUN: mlir-opt -convert-memref-to-emitc="lower-to-cpp=false" %s -split-input-file | FileCheck %s --check-prefixes=CHECK,NOCPP
+
+// Two alloc + copy pairs reading from the same source buffer. The shared
+// prefix covers the IR common to both modes; the CPP and NOCPP prefixes cover
+// only the includes. Each copy takes the address of element 0 of both buffers.
+func.func @alloc_copy(%arg0: memref<999xi32>) {
+  %alloc = memref.alloc() : memref<999xi32>
+  memref.copy %arg0, %alloc : memref<999xi32> to memref<999xi32>
+  %alloc_1 = memref.alloc() : memref<999xi32>
+  memref.copy %arg0, %alloc_1 : memref<999xi32> to memref<999xi32>
+  return
+}
+
+// CHECK: module {
+// NOCPP:  emitc.include <"stdlib.h">
+// NOCPP-NEXT:  emitc.include <"string.h">
+
+// CPP:  emitc.include <"cstdlib">
+// CPP-NEXT:  emitc.include <"cstring">
+
+// CHECK-LABEL: alloc_copy
+// CHECK-SAME: %[[ARG0:.*]]: memref<999xi32>
+// CHECK-NEXT:  %[[SRC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<999xi32> to !emitc.array<999xi32>
+// CHECK-NEXT:  %[[ELT_SIZE0:.*]] = emitc.call_opaque "sizeof"() {args = [i32]} : () -> !emitc.size_t
+// CHECK-NEXT:  %[[NUM_ELTS0:.*]] = "emitc.constant"() <{value = 999 : index}> : () -> index
+// CHECK-NEXT:  %[[ALLOC_BYTES0:.*]] = emitc.mul %[[ELT_SIZE0]], %[[NUM_ELTS0]] : (!emitc.size_t, index) -> !emitc.size_t
+// CHECK-NEXT:  %[[RAW0:.*]] = emitc.call_opaque "malloc"(%[[ALLOC_BYTES0]]) : (!emitc.size_t) -> !emitc.ptr<!emitc.opaque<"void">>
+// CHECK-NEXT:  %[[PTR0:.*]] = emitc.cast %[[RAW0]] : !emitc.ptr<!emitc.opaque<"void">> to !emitc.ptr<i32>
+// CHECK-NEXT:  %[[DST0:.*]] = builtin.unrealized_conversion_cast %[[PTR0]] : !emitc.ptr<i32> to !emitc.array<999xi32>
+// CHECK-NEXT:  %[[ZERO0:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT:  %[[SRC_ELT0:.*]] = emitc.subscript %[[SRC]][%[[ZERO0]]] : (!emitc.array<999xi32>, index) -> !emitc.lvalue<i32>
+// CHECK-NEXT:  %[[SRC_PTR0:.*]] = emitc.apply "&"(%[[SRC_ELT0]]) : (!emitc.lvalue<i32>) -> !emitc.ptr<i32>
+// CHECK-NEXT:  %[[DST_ELT0:.*]] = emitc.subscript %[[DST0]][%[[ZERO0]]] : (!emitc.array<999xi32>, index) -> !emitc.lvalue<i32>
+// CHECK-NEXT:  %[[DST_PTR0:.*]] = emitc.apply "&"(%[[DST_ELT0]]) : (!emitc.lvalue<i32>) -> !emitc.ptr<i32>
+// CHECK-NEXT:  %[[CPY_ELT_SIZE0:.*]] = emitc.call_opaque "sizeof"() {args = [i32]} : () -> !emitc.size_t
+// CHECK-NEXT:  %[[CPY_NUM_ELTS0:.*]] = "emitc.constant"() <{value = 999 : index}> : () -> index
+// CHECK-NEXT:  %[[CPY_BYTES0:.*]] = emitc.mul %[[CPY_ELT_SIZE0]], %[[CPY_NUM_ELTS0]] : (!emitc.size_t, index) -> !emitc.size_t
+// CHECK-NEXT:  emitc.call_opaque "memcpy"(%[[DST_PTR0]], %[[SRC_PTR0]], %[[CPY_BYTES0]]) : (!emitc.ptr<i32>, !emitc.ptr<i32>, !emitc.size_t) -> ()
+// CHECK-NEXT:  %[[ELT_SIZE1:.*]] = emitc.call_opaque "sizeof"() {args = [i32]} : () -> !emitc.size_t
+// CHECK-NEXT:  %[[NUM_ELTS1:.*]] = "emitc.constant"() <{value = 999 : index}> : () -> index
+// CHECK-NEXT:  %[[ALLOC_BYTES1:.*]] = emitc.mul %[[ELT_SIZE1]], %[[NUM_ELTS1]] : (!emitc.size_t, index) -> !emitc.size_t
+// CHECK-NEXT:  %[[RAW1:.*]] = emitc.call_opaque "malloc"(%[[ALLOC_BYTES1]]) : (!emitc.size_t) -> !emitc.ptr<!emitc.opaque<"void">>
+// CHECK-NEXT:  %[[PTR1:.*]] = emitc.cast %[[RAW1]] : !emitc.ptr<!emitc.opaque<"void">> to !emitc.ptr<i32>
+// CHECK-NEXT:  %[[DST1:.*]] = builtin.unrealized_conversion_cast %[[PTR1]] : !emitc.ptr<i32> to !emitc.array<999xi32>
+// CHECK-NEXT:  %[[ZERO1:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT:  %[[SRC_ELT1:.*]] = emitc.subscript %[[SRC]][%[[ZERO1]]] : (!emitc.array<999xi32>, index) -> !emitc.lvalue<i32>
+// CHECK-NEXT:  %[[SRC_PTR1:.*]] = emitc.apply "&"(%[[SRC_ELT1]]) : (!emitc.lvalue<i32>) -> !emitc.ptr<i32>
+// CHECK-NEXT:  %[[DST_ELT1:.*]] = emitc.subscript %[[DST1]][%[[ZERO1]]] : (!emitc.array<999xi32>, index) -> !emitc.lvalue<i32>
+// CHECK-NEXT:  %[[DST_PTR1:.*]] = emitc.apply "&"(%[[DST_ELT1]]) : (!emitc.lvalue<i32>) -> !emitc.ptr<i32>
+// CHECK-NEXT:  %[[CPY_ELT_SIZE1:.*]] = emitc.call_opaque "sizeof"() {args = [i32]} : () -> !emitc.size_t
+// CHECK-NEXT:  %[[CPY_NUM_ELTS1:.*]] = "emitc.constant"() <{value = 999 : index}> : () -> index
+// CHECK-NEXT:  %[[CPY_BYTES1:.*]] = emitc.mul %[[CPY_ELT_SIZE1]], %[[CPY_NUM_ELTS1]] : (!emitc.size_t, index) -> !emitc.size_t
+// CHECK-NEXT:  emitc.call_opaque "memcpy"(%[[DST_PTR1]], %[[SRC_PTR1]], %[[CPY_BYTES1]]) : (!emitc.ptr<i32>, !emitc.ptr<i32>, !emitc.size_t) -> ()
+// CHECK-NEXT:    return
diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-copy.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-copy.mlir
new file mode 100644
index 0000000..d151d1b
--- /dev/null
+++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-copy.mlir
@@ -0,0 +1,32 @@
+// RUN: mlir-opt -convert-memref-to-emitc="lower-to-cpp=true" %s -split-input-file | FileCheck %s --check-prefixes=CHECK,CPP
+// RUN: mlir-opt -convert-memref-to-emitc="lower-to-cpp=false" %s -split-input-file | FileCheck %s --check-prefixes=CHECK,NOCPP
+
+// Copy between two statically shaped memrefs. The shared prefix covers the IR
+// common to both modes; the CPP and NOCPP prefixes cover only the include.
+// The constant 1260 below is 9 * 4 * 5 * 7, the number of f32 elements copied.
+func.func @copying(%arg0 : memref<9x4x5x7xf32>, %arg1 : memref<9x4x5x7xf32>) {
+  memref.copy %arg0, %arg1 : memref<9x4x5x7xf32> to memref<9x4x5x7xf32>
+  return
+}
+
+// CHECK: module {
+// NOCPP:  emitc.include <"string.h">
+// CPP:  emitc.include <"cstring">
+
+// CHECK-LABEL:  copying
+// CHECK-SAME: %[[ARG0:.*]]: memref<9x4x5x7xf32>, %[[ARG1:.*]]: memref<9x4x5x7xf32>
+// CHECK-NEXT: %[[DST:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<9x4x5x7xf32> to !emitc.array<9x4x5x7xf32>
+// CHECK-NEXT: %[[SRC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<9x4x5x7xf32> to !emitc.array<9x4x5x7xf32>
+// CHECK-NEXT: %[[ZERO:.*]] = "emitc.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT: %[[SRC_ELT:.*]] = emitc.subscript %[[SRC]][%[[ZERO]], %[[ZERO]], %[[ZERO]], %[[ZERO]]] : (!emitc.array<9x4x5x7xf32>, index, index, index, index) -> !emitc.lvalue<f32>
+// CHECK-NEXT: %[[SRC_PTR:.*]] = emitc.apply "&"(%[[SRC_ELT]]) : (!emitc.lvalue<f32>) -> !emitc.ptr<f32>
+// CHECK-NEXT: %[[DST_ELT:.*]] = emitc.subscript %[[DST]][%[[ZERO]], %[[ZERO]], %[[ZERO]], %[[ZERO]]] : (!emitc.array<9x4x5x7xf32>, index, index, index, index) -> !emitc.lvalue<f32>
+// CHECK-NEXT: %[[DST_PTR:.*]] = emitc.apply "&"(%[[DST_ELT]]) : (!emitc.lvalue<f32>) -> !emitc.ptr<f32>
+// CHECK-NEXT: %[[ELT_SIZE:.*]] = emitc.call_opaque "sizeof"() {args = [f32]} : () -> !emitc.size_t
+// CHECK-NEXT: %[[NUM_ELTS:.*]] = "emitc.constant"() <{value = 1260 : index}> : () -> index
+// CHECK-NEXT: %[[BYTES:.*]] = emitc.mul %[[ELT_SIZE]], %[[NUM_ELTS]] : (!emitc.size_t, index) -> !emitc.size_t
+// CHECK-NEXT: emitc.call_opaque "memcpy"(%[[DST_PTR]], %[[SRC_PTR]], %[[BYTES]]) : (!emitc.ptr<f32>, !emitc.ptr<f32>, !emitc.size_t) -> ()
+// CHECK-NEXT:    return
+// CHECK-NEXT:  }
+// CHECK-NEXT:}
+
diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir
index fda0197..b6eccfc 100644
--- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir
+++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir
@@ -1,13 +1,5 @@
 // RUN: mlir-opt -convert-memref-to-emitc %s -split-input-file -verify-diagnostics
 
-func.func @memref_op(%arg0 : memref<2x4xf32>) {
-  // expected-error@+1 {{failed to legalize operation 'memref.copy'}}
-  memref.copy %arg0, %arg0 : memref<2x4xf32> to memref<2x4xf32>
-  return
-}
-
-// -----
-
 func.func @alloca_with_dynamic_shape() {
   %0 = index.constant 1
   // expected-error@+1 {{failed to legalize operation 'memref.alloca'}}
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index e505767..2487334 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -3,6 +3,7 @@
 // Same below, but using the `ConvertToLLVMPatternInterface` entry point
 // and the generic `convert-to-llvm` pass.
 // RUN: mlir-opt --convert-to-llvm --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @init_mbarrier
 llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) {
diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
index d69de99..7d8ccd9 100644
--- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
+++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt -convert-openmp-to-llvm -split-input-file %s | FileCheck %s
 // RUN: mlir-opt -convert-to-llvm -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -convert-to-llvm="allow-pattern-rollback=0" -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: llvm.func @foo(i64, i64)
 func.func private @foo(index, index)
diff --git a/mlir/test/Conversion/UBToLLVM/ub-to-llvm.mlir b/mlir/test/Conversion/UBToLLVM/ub-to-llvm.mlir
index 5307e47..6c0b111 100644
--- a/mlir/test/Conversion/UBToLLVM/ub-to-llvm.mlir
+++ b/mlir/test/Conversion/UBToLLVM/ub-to-llvm.mlir
@@ -3,6 +3,7 @@
 // Same below, but using the `ConvertToLLVMPatternInterface` entry point
 // and the generic `convert-to-llvm` pass.
 // RUN: mlir-opt --convert-to-llvm="filter-dialects=ub" --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="filter-dialects=ub allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @check_poison
 func.func @check_poison() {
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
index 5a424a8..9b57b1b 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt --convert-to-llvm="filter-dialects=vector" --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm="filter-dialects=vector allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 // RUN: mlir-opt %s -convert-vector-to-llvm -split-input-file | FileCheck %s
 
 //===========================================================================//
@@ -182,8 +183,7 @@ func.func @shuffle_0D_direct(%arg0: vector<f32>) -> vector<3xf32> {
 }
 // CHECK-LABEL: @shuffle_0D_direct(
 //  CHECK-SAME:     %[[A:.*]]: vector<f32>
-//       CHECK:   %[[c:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<f32> to vector<1xf32>
-//       CHECK:   %[[s:.*]] = llvm.shufflevector %[[c]], %[[c]] [0, 1, 0] : vector<1xf32>
+//       CHECK:   %[[s:.*]] = llvm.shufflevector %{{.*}}, %{{.*}} [0, 1, 0] : vector<1xf32>
 //       CHECK:   return %[[s]] : vector<3xf32>
 
 // -----
diff --git a/mlir/test/Dialect/Linalg/decompose-unpack.mlir b/mlir/test/Dialect/Linalg/decompose-unpack.mlir
index e173d55..a53dde8 100644
--- a/mlir/test/Dialect/Linalg/decompose-unpack.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-unpack.mlir
@@ -203,3 +203,20 @@ func.func @unpack_with_non_trailing_dimensions_in_inner_dims(%arg0: tensor<1x1x1
 // CHECK-SAME:                      outs(%[[EMPTY]] : tensor<1x4xf32>) permutation = [1, 0]
 // CHECK:        %[[INSERT:.+]] = tensor.insert_slice %transposed into %[[DEST]][0, 0, 0] [1, 1, 4] [1, 1, 1] : tensor<1x4xf32> into tensor<1x1x4xf32>
 // CHECK:        return %[[INSERT]]
+
+// -----
+
+/// Negative test: the tiled outer dim has size 126 (non-unit), which is not supported, so the linalg.unpack op must be left unchanged.
+
+func.func @negative_non_unit_tiled_outer_dim(%src: tensor<1x126x1x1x8xf32>, %dest: tensor<1x1x1x1001xf32>) -> tensor<1x1x1x1001xf32> {
+  %unpack = linalg.unpack %src
+    outer_dims_perm = [0, 3, 2, 1]
+    inner_dims_pos = [3]
+    inner_tiles = [8]
+    into %dest : tensor<1x126x1x1x8xf32>
+    -> tensor<1x1x1x1001xf32>
+
+  return %unpack : tensor<1x1x1x1001xf32>
+}
+// CHECK-LABEL: @negative_non_unit_tiled_outer_dim(
+// CHECK: linalg.unpack
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index 095810f..01eb210 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -880,22 +880,22 @@ func.func @mmt4d_scalable(%A: memref<16x16x8x1xf32>, %B: memref<16x16x?x1xf32>,
 // CHECK-SAME:      %[[A:.*]]: memref<16x16x8x1xf32>,
 // CHECK-SAME:      %[[B:.*]]: memref<16x16x?x1xf32>,
 // CHECK-SAME:      %[[C_IN:.*]]: memref<16x16x8x?xf32>) {
-// CHECK:           %[[VAL_0:.*]] = arith.constant 16 : index
-// CHECK:           %[[VAL_1:.*]] = arith.constant 16 : index
-// CHECK:           %[[VAL_2:.*]] = arith.constant 16 : index
+// CHECK:           %[[C16_M:.*]] = arith.constant 16 : index
+// CHECK:           %[[C16_N:.*]] = arith.constant 16 : index
+// CHECK:           %[[C16_K:.*]] = arith.constant 16 : index
 // CHECK:           %[[C8:.*]] = arith.constant 8 : index
 // CHECK:           %[[C2:.*]] = arith.constant 2 : index
 // CHECK:           %[[DIM_2:.*]] = memref.dim %[[B]], %[[C2]] : memref<16x16x?x1xf32>
-// CHECK:           %[[VAL_6:.*]] = arith.constant 1 : index
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x[4]x1xf32>
-// CHECK:           %[[MASK_1:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_2]], %[[DIM_2]], %[[VAL_6]] : vector<16x16x[4]x1xi1>
+// CHECK:           %[[MASK_1:.*]] = vector.create_mask %[[C16_N]], %[[C16_K]], %[[DIM_2]], %[[C1]] : vector<16x16x[4]x1xi1>
 // CHECK:           %[[VEC_B:.*]] = vector.mask %[[MASK_1]] { vector.transfer_read %[[B]]{{.*}} : memref<16x16x?x1xf32>, vector<16x16x16x8x[4]x1xf32> } : vector<16x16x[4]x1xi1> -> vector<16x16x16x8x[4]x1xf32>
-// CHECK:           %[[MASK_2:.*]] = vector.create_mask %[[VAL_0]], %[[VAL_1]], %[[C8]], %[[DIM_2]] : vector<16x16x8x[4]xi1>
-// CHECK:           %[[VAL_15:.*]] = vector.mask %[[MASK_2]] { vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32> } : vector<16x16x8x[4]xi1> -> vector<16x16x8x[4]xf32>
-// CHECK:           %[[VAL_16:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32>
-// CHECK:           %[[MASK_3:.*]] = vector.create_mask %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[C8]], %[[DIM_2]], %[[VAL_6]] : vector<16x16x16x8x[4]x1xi1>
-// CHECK:           %[[VAL_18:.*]] = vector.mask %[[MASK_3]] { vector.multi_reduction <add>, %[[VAL_16]], %[[VAL_15]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32> } : vector<16x16x16x8x[4]x1xi1> -> vector<16x16x8x[4]xf32>
-// CHECK:           vector.mask %[[MASK_2]] { vector.transfer_write %[[VAL_18]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32> } : vector<16x16x8x[4]xi1>
+// CHECK:           %[[MASK_2:.*]] = vector.create_mask %[[C16_M]], %[[C16_N]], %[[C8]], %[[DIM_2]] : vector<16x16x8x[4]xi1>
+// CHECK:           %[[VEC_C:.*]] = vector.mask %[[MASK_2]] { vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32> } : vector<16x16x8x[4]xi1> -> vector<16x16x8x[4]xf32>
+// CHECK:           %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32>
+// CHECK:           %[[MASK_3:.*]] = vector.create_mask %[[C16_M]], %[[C16_N]], %[[C16_K]], %[[C8]], %[[DIM_2]], %[[C1]] : vector<16x16x16x8x[4]x1xi1>
+// CHECK:           %[[RED:.*]] = vector.mask %[[MASK_3]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32> } : vector<16x16x16x8x[4]x1xi1> -> vector<16x16x8x[4]xf32>
+// CHECK:           vector.mask %[[MASK_2]] { vector.transfer_write %[[RED]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32> } : vector<16x16x8x[4]xi1>
 
 
 module attributes {transform.with_named_sequence} {
@@ -920,10 +920,10 @@ func.func @mmt4d_scalable_with_assume(%A: memref<16x16x8x1xf32>, %B: memref<16x1
 // CHECK-NOT:       mask
 // CHECK:           %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x[4]x1xf32>
 // CHECK:           %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x?x1xf32>, vector<16x16x16x8x[4]x1xf32>
-// CHECK:           %[[VAL_13:.*]] = vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32>
-// CHECK:           %[[VAL_14:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32>
-// CHECK:           %[[VAL_15:.*]] = vector.multi_reduction <add>, %[[VAL_14]], %[[VAL_13]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32>
-// CHECK:           vector.transfer_write %[[VAL_15]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32>
+// CHECK:           %[[VEC_C:.*]] = vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32>
+// CHECK:           %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32>
+// CHECK:           %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32>
+// CHECK:           vector.transfer_write %[[RED]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -936,6 +936,100 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 ///----------------------------------------------------------------------------------------
+/// Tests for linalg.batch_mmt4d
+///----------------------------------------------------------------------------------------
+
+func.func @batch_mmt4d(%A: memref<2x16x16x8x1xf32>, %B: memref<2x16x16x8x1xf32>, %C_in: memref<2x16x16x8x8xf32>) {
+  linalg.batch_mmt4d ins(%A, %B: memref<2x16x16x8x1xf32>, memref<2x16x16x8x1xf32>)
+               outs(%C_in: memref<2x16x16x8x8xf32>)
+  return
+}
+
+// CHECK-LABEL:   func.func @batch_mmt4d(
+// CHECK-SAME:      %[[A:.*]]: memref<2x16x16x8x1xf32>, %[[B:.*]]: memref<2x16x16x8x1xf32>, %[[C:.*]]: memref<2x16x16x8x8xf32>) {
+// CHECK:           %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<2x16x16x8x1xf32>, vector<2x16x16x16x8x8x1xf32>
+// CHECK:           %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<2x16x16x8x1xf32>, vector<2x16x16x16x8x8x1xf32>
+// CHECK:           %[[VEC_C:.*]] = vector.transfer_read %[[C]]{{.*}} : memref<2x16x16x8x8xf32>, vector<2x16x16x8x8xf32>
+// CHECK:           %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<2x16x16x16x8x8x1xf32>
+// CHECK:           %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [3, 6] : vector<2x16x16x16x8x8x1xf32> to vector<2x16x16x8x8xf32>
+// CHECK:           vector.transfer_write %[[RED]], %[[C]]{{.*}} : vector<2x16x16x8x8xf32>, memref<2x16x16x8x8xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %batch_mmt4d = transform.structured.match ops{["linalg.batch_mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %batch_mmt4d : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @batch_mmt4d_scalable(%A: memref<2x16x16x8x1xf32>, %B: memref<2x16x16x?x1xf32>, %C_in: memref<2x16x16x8x?xf32>) {
+  linalg.batch_mmt4d ins(%A, %B: memref<2x16x16x8x1xf32>, memref<2x16x16x?x1xf32>)
+               outs(%C_in: memref<2x16x16x8x?xf32>)
+  return
+}
+// CHECK-LABEL:   func.func @batch_mmt4d_scalable(
+// CHECK-SAME:      %[[A:.*]]: memref<2x16x16x8x1xf32>,
+// CHECK-SAME:      %[[B:.*]]: memref<2x16x16x?x1xf32>,
+// CHECK-SAME:      %[[C_IN:.*]]: memref<2x16x16x8x?xf32>) {
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+// CHECK:           %[[C16_M:.*]] = arith.constant 16 : index
+// CHECK:           %[[C16_N:.*]] = arith.constant 16 : index
+// CHECK:           %[[C16_K:.*]] = arith.constant 16 : index
+// CHECK:           %[[C8:.*]] = arith.constant 8 : index
+// CHECK:           %[[C3:.*]] = arith.constant 3 : index
+// CHECK:           %[[DIM_N_IN:.*]] = memref.dim %[[B]], %[[C3]] : memref<2x16x16x?x1xf32>
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<2x16x16x8x1xf32>, vector<2x16x16x16x8x[4]x1xf32>
+// CHECK:           %[[MASK_1:.*]] = vector.create_mask %[[C2]], %[[C16_N]], %[[C16_K]], %[[DIM_N_IN]], %[[C1]] : vector<2x16x16x[4]x1xi1>
+// CHECK:           %[[VEC_B:.*]] = vector.mask %[[MASK_1]] { vector.transfer_read %[[B]]{{.*}} : memref<2x16x16x?x1xf32>, vector<2x16x16x16x8x[4]x1xf32> } : vector<2x16x16x[4]x1xi1> -> vector<2x16x16x16x8x[4]x1xf32>
+// CHECK:           %[[MASK_2:.*]] = vector.create_mask %[[C2]], %[[C16_M]], %[[C16_N]], %[[C8]], %[[DIM_N_IN]] : vector<2x16x16x8x[4]xi1>
+// CHECK:           %[[VEC_C:.*]] = vector.mask %[[MASK_2]] { vector.transfer_read %[[C_IN]]{{.*}} : memref<2x16x16x8x?xf32>, vector<2x16x16x8x[4]xf32> } : vector<2x16x16x8x[4]xi1> -> vector<2x16x16x8x[4]xf32>
+// CHECK:           %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<2x16x16x16x8x[4]x1xf32>
+// CHECK:           %[[MASK_3:.*]] = vector.create_mask %[[C2]], %[[C16_M]], %[[C16_N]], %[[C16_K]], %[[C8]], %[[DIM_N_IN]], %[[C1]] : vector<2x16x16x16x8x[4]x1xi1>
+// CHECK:           %[[RED:.*]] = vector.mask %[[MASK_3]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [3, 6] : vector<2x16x16x16x8x[4]x1xf32> to vector<2x16x16x8x[4]xf32> } : vector<2x16x16x16x8x[4]x1xi1> -> vector<2x16x16x8x[4]xf32>
+// CHECK:           vector.mask %[[MASK_2]] { vector.transfer_write %[[RED]], %[[C_IN]]{{.*}} : vector<2x16x16x8x[4]xf32>, memref<2x16x16x8x?xf32> } : vector<2x16x16x8x[4]xi1>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %batch_mmt4d = transform.structured.match ops{["linalg.batch_mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %batch_mmt4d vector_sizes [2, 16, 16, 16, 8, [4], 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @batch_mmt4d_scalable_with_assume(%A: memref<2x16x16x8x1xf32>, %B: memref<2x16x16x?x1xf32>, %C_in: memref<2x16x16x8x?xf32>) {
+  linalg.batch_mmt4d ins(%A, %B: memref<2x16x16x8x1xf32>, memref<2x16x16x?x1xf32>)
+               outs(%C_in: memref<2x16x16x8x?xf32>)
+  return
+}
+// CHECK-LABEL:   func.func @batch_mmt4d_scalable_with_assume(
+// CHECK-SAME:      %[[A:.*]]: memref<2x16x16x8x1xf32>,
+// CHECK-SAME:      %[[B:.*]]: memref<2x16x16x?x1xf32>,
+// CHECK-SAME:      %[[C_IN:.*]]: memref<2x16x16x8x?xf32>) {
+// CHECK-NOT:       mask
+// CHECK:           %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<2x16x16x8x1xf32>, vector<2x16x16x16x8x[4]x1xf32>
+// CHECK:           %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<2x16x16x?x1xf32>, vector<2x16x16x16x8x[4]x1xf32>
+// CHECK:           %[[VEC_C:.*]] = vector.transfer_read %[[C_IN]]{{.*}} : memref<2x16x16x8x?xf32>, vector<2x16x16x8x[4]xf32>
+// CHECK:           %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<2x16x16x16x8x[4]x1xf32>
+// CHECK:           %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [3, 6] : vector<2x16x16x16x8x[4]x1xf32> to vector<2x16x16x8x[4]xf32>
+// CHECK:           vector.transfer_write %[[RED]], %[[C_IN]]{{.*}} : vector<2x16x16x8x[4]xf32>, memref<2x16x16x8x?xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %batch_mmt4d = transform.structured.match ops{["linalg.batch_mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %batch_mmt4d vector_sizes [2, 16, 16, 16, 8, [4], 1] {assume_dynamic_dims_match_vec_sizes} : !transform.any_op
+    transform.yield
+  }
+}
+
+
+// -----
+
+///----------------------------------------------------------------------------------------
 /// Tests for linalg.unpack
 ///----------------------------------------------------------------------------------------
 
diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir
index 2b64fa4..f735e3f 100644
--- a/mlir/test/Dialect/NVGPU/invalid.mlir
+++ b/mlir/test/Dialect/NVGPU/invalid.mlir
@@ -378,3 +378,14 @@ func.func @check_matrixC_dim(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %ar
   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<4xf16>) -> vector<2x2xf16>
   return %d : vector<2x2xf16>
 }
+
+// -----
+
+!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x8xi8,3>, swizzle=none, l2promo = none, oob = zero, interleave = none>
+!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
+func.func @tma_last_dim_bytes(%desc: !desc, %buffer: memref<32x8xi8,3>, %mbarrier: !mbarrier) {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{the bytes in the last dimension of the tensor map must be a multiple of 16}}
+  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer : !desc, !mbarrier -> memref<32x8xi8,3>
+  return
+}
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 308cf150..7804cc7 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -1912,3 +1912,16 @@ func.func @index_switch_fold_no_res() {
 
 // CHECK-LABEL: func.func @index_switch_fold_no_res()
 //  CHECK-NEXT: "test.op"() : () -> ()
+
+// -----
+
+// CHECK-LABEL: func @scf_for_all_step_size_0()
+//       CHECK:   scf.forall (%{{.*}}) = (0) to (1) step (0)
+func.func @scf_for_all_step_size_0()  {
+  %x = arith.constant 0 : index
+  scf.forall (%i, %j) = (0, 4) to (1, 5) step (%x, 8) {
+    vector.print %x : index
+    scf.forall.in_parallel {}
+  }
+  return
+}
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index 5150ee3..930bb9f 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -565,6 +565,33 @@ func.func @mul_zero_broadcast(%arg0: tensor<2x3xf32>) -> (tensor<2x3xf32>, tenso
 
 // -----
 
+// CHECK-LABEL: @mul_zero_dynamic_nofold
+// CHECK-SAME:                    %[[ARG0:.*]]: tensor<?x17xf32>) -> tensor<?x17xf32> {
+// CHECK:           %[[ZERO:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1xf32>}> : () -> tensor<1x1xf32>
+// CHECK:           %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+// CHECK:           %[[MUL:.*]] = tosa.mul %[[ARG0]], %[[ZERO]], %[[SHIFT]] : (tensor<?x17xf32>, tensor<1x1xf32>, tensor<1xi8>) -> tensor<?x17xf32>
+// CHECK:           return %[[MUL]]
+func.func @mul_zero_dynamic_nofold(%arg0: tensor<?x17xf32>) -> tensor<?x17xf32> {
+  %0 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1xf32>}> : () -> tensor<1x1xf32>
+  %1 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %2 = tosa.mul %arg0, %0, %1 : (tensor<?x17xf32>, tensor<1x1xf32>, tensor<1xi8>) -> tensor<?x17xf32>
+  return %2 : tensor<?x17xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @mul_one_dynamic_fold
+// CHECK-SAME:                    %[[ARG0:.*]]: tensor<?x17xf32>) -> tensor<?x17xf32> {
+// CHECK:           return %[[ARG0]]
+func.func @mul_one_dynamic_fold(%arg0: tensor<?x17xf32>) -> tensor<?x17xf32> {
+  %0 = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1xf32>}> : () -> tensor<1x1xf32>
+  %1 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %2 = tosa.mul %arg0, %0, %1 : (tensor<?x17xf32>, tensor<1x1xf32>, tensor<1xi8>) -> tensor<?x17xf32>
+  return %2 : tensor<?x17xf32>
+}
+
+// -----
+
 // CHECK-LABEL: @select_same_value
 func.func @select_same_value(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> {
   %0 = tosa.select %arg0, %arg1, %arg1 : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 180ba8a..f4a49da 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -365,4 +365,11 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
   } {sg_id_range = #xegpu.range<[3, 19]>}
   gpu.return
   }
+
+  // CHECK-LABEL: distribute_constant
+  gpu.func @distribute_constant() { // 256x128 over sg_layout [8, 4] gives 32x32 per subgroup
+    // CHECK: arith.constant dense<1.000000e+00> : vector<32x32xf32>
+    %splat = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} dense<1.000000e+00> : vector<256x128xf32>
+    gpu.return
+  }
 }
diff --git a/mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir
index 8f74976..25a338d 100644
--- a/mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/assume-alignment-runtime-verification.mlir
@@ -6,6 +6,15 @@
 // RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -expand-strided-metadata \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @main() {
   // This buffer is properly aligned. There should be no error.
   // CHECK-NOT: ^ memref is not aligned to 8
diff --git a/mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir
index 26c731c..4c6a48d 100644
--- a/mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/atomic-rmw-runtime-verification.mlir
@@ -5,6 +5,14 @@
 // RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @store_dynamic(%memref: memref<?xf32>, %index: index) {
   %cst = arith.constant 1.0 : f32
   memref.atomic_rmw addf %cst, %memref[%index] : (f32, memref<?xf32>) -> f32
diff --git a/mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir
index 8b6308e..1ac1030 100644
--- a/mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/cast-runtime-verification.mlir
@@ -1,11 +1,20 @@
 // RUN: mlir-opt %s -generate-runtime-verification \
-// RUN:     -test-cf-assert \
 // RUN:     -expand-strided-metadata \
+// RUN:     -test-cf-assert \
 // RUN:     -convert-to-llvm | \
 // RUN: mlir-runner -e main -entry-point-result=void \
 // RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -expand-strided-metadata \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @cast_to_static_dim(%m: memref<?xf32>) -> memref<10xf32> {
   %0 = memref.cast %m : memref<?xf32> to memref<10xf32>
   return %0 : memref<10xf32>
diff --git a/mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir
index 95b9db2..be9417b 100644
--- a/mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/copy-runtime-verification.mlir
@@ -6,6 +6,15 @@
 // RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -expand-strided-metadata \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 // Put memref.copy in a function, otherwise the memref.cast may fold.
 func.func @memcpy_helper(%src: memref<?xf32>, %dest: memref<?xf32>) {
   memref.copy %src, %dest : memref<?xf32> to memref<?xf32>
diff --git a/mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir
index 2e3f271..ef4af62 100644
--- a/mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/dim-runtime-verification.mlir
@@ -6,6 +6,15 @@
 // RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -expand-strided-metadata \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @main() {
   %c4 = arith.constant 4 : index
   %alloca = memref.alloca() : memref<1xf32>
diff --git a/mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir
index b87e5bd..2e42648 100644
--- a/mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/load-runtime-verification.mlir
@@ -1,12 +1,20 @@
 // RUN: mlir-opt %s -generate-runtime-verification \
-// RUN:     -test-cf-assert \
 // RUN:     -expand-strided-metadata \
-// RUN:     -lower-affine \
+// RUN:     -test-cf-assert \
 // RUN:     -convert-to-llvm | \
 // RUN: mlir-runner -e main -entry-point-result=void \
 // RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -expand-strided-metadata \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @load(%memref: memref<1xf32>, %index: index) {
     memref.load %memref[%index] :  memref<1xf32>
     return
diff --git a/mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir
index 12253fa..dd000c6 100644
--- a/mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/store-runtime-verification.mlir
@@ -5,6 +5,14 @@
 // RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @store_dynamic(%memref: memref<?xf32>, %index: index) {
   %cst = arith.constant 1.0 : f32
   memref.store %cst, %memref[%index] :  memref<?xf32>
diff --git a/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir
index ec7e408..9fbe5bc 100644
--- a/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir
@@ -1,12 +1,22 @@
 // RUN: mlir-opt %s -generate-runtime-verification \
-// RUN:     -test-cf-assert \
 // RUN:     -expand-strided-metadata \
 // RUN:     -lower-affine \
+// RUN:     -test-cf-assert \
 // RUN:     -convert-to-llvm | \
 // RUN: mlir-runner -e main -entry-point-result=void \
 // RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -expand-strided-metadata \
+// RUN:     -lower-affine \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @subview(%memref: memref<1xf32>, %offset: index) {
     memref.subview %memref[%offset] [1] [1] : 
         memref<1xf32> to 
diff --git a/mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir
index e4aab32..f37a6d6 100644
--- a/mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Tensor/cast-runtime-verification.mlir
@@ -8,6 +8,17 @@
 // RUN:     -shared-libs=%tlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN:     -buffer-deallocation-pipeline=private-function-dynamic-ownership \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%tlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func private @cast_to_static_dim(%t: tensor<?xf32>) -> tensor<10xf32> {
   %0 = tensor.cast %t : tensor<?xf32> to tensor<10xf32>
   return %0 : tensor<10xf32>
diff --git a/mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir
index c6d8f698..e9e5c04 100644
--- a/mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Tensor/dim-runtime-verification.mlir
@@ -1,10 +1,20 @@
 // RUN: mlir-opt %s -generate-runtime-verification \
-// RUN:     -one-shot-bufferize \
-// RUN:     -buffer-deallocation-pipeline \
+// RUN:     -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN:     -buffer-deallocation-pipeline=private-function-dynamic-ownership \
 // RUN:     -test-cf-assert \
 // RUN:     -convert-to-llvm | \
 // RUN: mlir-runner -e main -entry-point-result=void \
-// RUN:     -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN:     -shared-libs=%tlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN:     -buffer-deallocation-pipeline=private-function-dynamic-ownership \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%tlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
 func.func @main() {
diff --git a/mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir
index 8e3cab7..73fcec4 100644
--- a/mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Tensor/extract-runtime-verification.mlir
@@ -8,6 +8,17 @@
 // RUN:     -shared-libs=%tlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN:     -buffer-deallocation-pipeline=private-function-dynamic-ownership \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%tlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @extract(%tensor: tensor<1xf32>, %index: index) {
     tensor.extract %tensor[%index] :  tensor<1xf32>
     return
diff --git a/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir
index 28f9be0..341a59e 100644
--- a/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir
@@ -8,6 +8,17 @@
 // RUN:     -shared-libs=%tlir_runner_utils 2>&1 | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN:     -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN:     -buffer-deallocation-pipeline=private-function-dynamic-ownership \
+// RUN:     -test-cf-assert \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-to-llvm="allow-pattern-rollback=0" \
+// RUN:     -reconcile-unrealized-casts | \
+// RUN: mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%tlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
 func.func @extract_slice(%tensor: tensor<1xf32>, %offset: index) {
     tensor.extract_slice %tensor[%offset] [1] [1] : tensor<1xf32> to tensor<1xf32>
     return
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
new file mode 100644
index 0000000..d0d51c6
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
@@ -0,0 +1,4 @@
+# The XeVM GPU integration tests are marked unsupported unless both XeVM test
+# support and the Level Zero runner are enabled in the lit configuration.
+if not all((config.run_xevm_tests, config.enable_levelzero_runner)):
+    config.unsupported = True
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
new file mode 100644
index 0000000..0bd3d3f
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
@@ -0,0 +1,146 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    // - The set of available `matrix_mad` intrinsics can differ based on the device's *minimum* supported sub-group size.
+    //   The *minimum supported* sub-group size should be used when calling `matrix_mad` intrinsics.
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html
+
+    gpu.func @block_dpas(%a: !llvm.ptr<1>, %b: !llvm.ptr<1>, %c: !llvm.ptr<1>) kernel { // loads A, B and C tiles, runs one xevm.mma, stores the result to %c
+      %base_width_a = arith.constant 32 : i32 // bytewidth of the A block
+      %base_height_a = arith.constant 8 : i32 // number of rows of A
+      %base_pitch_a = arith.constant 32 : i32 // bytewidth of one row of A
+      %x = arith.constant 0 : i32 // every load/store below uses offset (0, 0)
+      %y = arith.constant 0 : i32
+      %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y
+          <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32,
+            transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16>
+
+      %base_width_b = arith.constant 32 : i32 // bytewidth of the B block
+      %base_height_b = arith.constant 16 : i32 // number of rows of B
+      %base_pitch_b = arith.constant 32 : i32 // bytewidth of one row of B
+      %loaded_b1 = xevm.blockload2d %b, %base_width_b, %base_height_b, %base_pitch_b, %x, %y
+          <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32,
+            transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
+      %loaded_b_casted = vector.bitcast %loaded_b1 : vector<16xi16> to vector<8xi32> // xevm.mma below takes B as vector<8xi32>
+
+      %base_width_c = arith.constant 64 : i32 // bytewidth of the C block
+      %base_height_c = arith.constant 8 : i32 // number of rows of C
+      %base_pitch_c = arith.constant 64 : i32 // bytewidth of one row of C
+      %loaded_c = xevm.blockload2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y
+          <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32,
+            transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+
+      %loaded_c_casted = vector.bitcast %loaded_c : vector<8xi32> to vector<8xf32> // loaded as i32, used as the f32 accumulator
+      %c_result = xevm.mma %loaded_a, %loaded_b_casted, %loaded_c_casted
+          {shape=<m=8, n=16, k=16>, types=<d=f32, a=f16, b=f16, c=f32>}
+          : (vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32>
+      %c_result_casted = vector.bitcast %c_result : vector<8xf32> to vector<8xi32> // blockstore2d below takes vector<8xi32>
+
+      xevm.blockstore2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y, %c_result_casted
+          <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}>
+          : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+  func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { // runs @block_dpas on copies of %a, %b, %c and returns the result in a new buffer
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // number of threads launched along x
+
+    %memref_a = gpu.alloc() : memref<8x16xf16>
+    gpu.memcpy %memref_a, %a : memref<8x16xf16>, memref<8x16xf16> // copy input A into the gpu.alloc'ed buffer
+    %a_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_a : memref<8x16xf16> -> index
+    %a_ptr_as_i64 = arith.index_cast %a_ptr_as_idx : index to i64
+    %a_ptr = llvm.inttoptr %a_ptr_as_i64 : i64 to !llvm.ptr
+    %a_ptr_casted = llvm.addrspacecast %a_ptr : !llvm.ptr to !llvm.ptr<1> // the kernel takes address-space-1 pointers
+
+    %memref_b = gpu.alloc() : memref<16x16xf16>
+    gpu.memcpy %memref_b, %b : memref<16x16xf16>, memref<16x16xf16> // copy input B into the gpu.alloc'ed buffer
+    %b_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_b : memref<16x16xf16> -> index
+    %b_ptr_as_i64 = arith.index_cast %b_ptr_as_idx : index to i64
+    %b_ptr = llvm.inttoptr %b_ptr_as_i64 : i64 to !llvm.ptr
+    %b_ptr_casted = llvm.addrspacecast %b_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_c = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32> // copy the initial accumulator C into the gpu.alloc'ed buffer
+    %c_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_c : memref<8x16xf32> -> index
+    %c_ptr_as_i64 = arith.index_cast %c_ptr_as_idx : index to i64
+    %c_ptr = llvm.inttoptr %c_ptr_as_i64 : i64 to !llvm.ptr
+    %c_ptr_casted = llvm.addrspacecast %c_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
+        args(%a_ptr_casted : !llvm.ptr<1>, %b_ptr_casted : !llvm.ptr<1>, %c_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_a : memref<8x16xf16>
+    gpu.dealloc %memref_b : memref<16x16xf16>
+    %res = memref.alloc() : memref<8x16xf32> // returned to the caller; @main deallocates it
+    gpu.memcpy %res, %memref_c : memref<8x16xf32>, memref<8x16xf32> // the kernel stored its result into the C buffer
+    gpu.dealloc %memref_c : memref<8x16xf32>
+    return %res : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} { // fills A, B and C, runs @test and prints the returned matrix
+    %A = memref.alloc() : memref<8x16xf16>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %row_idx = arith.index_cast %i : index to i32
+        %row = arith.sitofp %row_idx : i32 to f16
+        memref.store %row, %A[%i, %j] : memref<8x16xf16> // A[i][j] = i
+      }
+    }
+    %B = memref.alloc() : memref<16x16xf16>
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %col_idx = arith.index_cast %j : index to i32
+        %col = arith.sitofp %col_idx : i32 to f16
+        memref.store %col, %B[%i, %j] : memref<16x16xf16> // B[i][j] = j
+      }
+    }
+
+    %C = memref.alloc() : memref<8x16xf32>
+    %c0_f32 = arith.constant 0.0 : f32
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c0_f32, %C[%i, %j] : memref<8x16xf32> // C[i][j] = 0
+      }
+    }
+
+    %C_res = call @test(%A, %B, %C) : (memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) -> memref<8x16xf32>
+    %C_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32>
+    // Expected result: C[i][j] = sum over k of A[i][k] * B[k][j] = 16 * i * j.
+    call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]
+    // CHECK-NEXT: [0,   16,   32,   48,   64,   80,   96,   112,   128,   144,   160,   176,   192,   208,   224,   240]
+    // CHECK-NEXT: [0,   32,   64,   96,   128,   160,   192,   224,   256,   288,   320,   352,   384,   416,   448,   480]
+    // CHECK-NEXT: [0,   48,   96,   144,   192,   240,   288,   336,   384,   432,   480,   528,   576,   624,   672,   720]
+    // CHECK-NEXT: [0,   64,   128,   192,   256,   320,   384,   448,   512,   576,   640,   704,   768,   832,   896,   960]
+    // CHECK-NEXT: [0,   80,   160,   240,   320,   400,   480,   560,   640,   720,   800,   880,   960,   1040,   1120,   1200]
+    // CHECK-NEXT: [0,   96,   192,   288,   384,   480,   576,   672,   768,   864,   960,   1056,   1152,   1248,   1344,   1440]
+    // CHECK-NEXT: [0,   112,   224,   336,   448,   560,   672,   784,   896,   1008,   1120,   1232,   1344,   1456,   1568,   1680]
+
+    memref.dealloc %A : memref<8x16xf16>
+    memref.dealloc %B : memref<16x16xf16>
+    memref.dealloc %C : memref<8x16xf32>
+    memref.dealloc %C_res : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
+
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
new file mode 100644
index 0000000..cea05b8
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
@@ -0,0 +1,109 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+
+  gpu.module @kernel {
+    // - `cl_intel_subgroups` block load/store intrinsics operate at the *maximum* sub-group size,
+    //     regardless of the active sub-group size. Make sure `clGetKernelSubGroupInfo` meets your expectations.
+    // - The attribute `intel_reqd_sub_group_size` establishes the maximum sub-group size for a kernel.
+    //
+    // Note: launching 16 threads without explicit `intel_reqd_sub_group_size = 16` may still use
+    //       the default sub-group size of 32.
+    //
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_required_subgroup_size.html
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html
+
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel  { // loads a tile from %src, overwrites element 0 of the loaded vector with the thread id, stores it to %dst
+      %base_width = arith.constant 64 : i32 // bytewidth of the block
+      %base_height = arith.constant 8 : i32 // number of rows
+      %base_pitch = arith.constant 64 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32 // load and store both use offset (0, 0)
+      %y = arith.constant 0 : i32
+      // If `intel_reqd_sub_group_size = 16` is not set, the default (32) is used and this `blockload2d`
+      // would only load 4 elements into vector<8xi32>
+      %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y
+          <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32,
+            transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32> // loaded as i32, edited as f32
+      %c0 = arith.constant 0 : index
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32 // thread id as an f32 value
+      %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[%c0] : f32 into vector<8xf32> // element 0 <- thread id
+      %loaded_modified = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32> // blockstore2d below takes vector<8xi32>
+      xevm.blockstore2d %dst, %base_width, %base_height, %base_pitch, %x, %y, %loaded_modified
+          <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}>
+          : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+  func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { // runs @block_load_store on a copy of %src and returns the stored tile in a new buffer
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %memref_src, %src : memref<8x16xf32>, memref<8x16xf32> // copy the input into the gpu.alloc'ed buffer
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<8x16xf32> -> index
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1> // the kernel takes address-space-1 pointers
+
+    %memref_dst = gpu.alloc() : memref<8x16xf32> // not initialized here; the kernel's 8x16 block store writes it
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
+        args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<8x16xf32>
+    %dst = memref.alloc() : memref<8x16xf32> // returned to the caller; @main deallocates it
+    gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_dst : memref<8x16xf32>
+    return %dst : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} { // fills A with 11.11, runs @test and prints both A and the result
+    %A = memref.alloc() : memref<8x16xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c11_f32 = arith.constant 11.11 : f32 // fill value for every element of A
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32>
+      }
+    }
+    %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32>
+    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () // source: every row is expected to still hold 11.11
+    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () // result: first row is expected to hold 0..15, the rest 11.11
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.11{{.*}}]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    memref.dealloc %A : memref<8x16xf32>
+    memref.dealloc %B : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>)
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
new file mode 100644
index 0000000..cb8ab1c
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
@@ -0,0 +1,131 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel  { // Loads one 16x16 f16 tile two ways, tags it with the thread id, and writes it to %dst as two 8x16 tiles.
+      %base_width = arith.constant 32 : i32 // bytewidth of the block
+      %base_height_load = arith.constant 16 : i32 // number of rows
+      %base_pitch = arith.constant 32 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+
+      // Consider the following two loads:
+      // Normal load:
+      %loaded = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y
+          <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32,
+            transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
+      %loaded_f16_flat = vector.bitcast %loaded : vector<16xi16> to vector<16xf16>
+      %loaded_f16 = vector.shape_cast %loaded_f16_flat : vector<16xf16> to vector<8x1x2xf16>
+
+      // Register packed load:
+      %loaded_packed = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y
+          <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32,
+            transpose=false, pack_register=true}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_packed_f16_flat = vector.bitcast %loaded_packed : vector<8xi32> to vector<16xf16>
+      %loaded_packed_f16 = vector.shape_cast %loaded_packed_f16_flat : vector<16xf16> to vector<8x1x2xf16>
+      // Both can be represented the same way in code as vector<16xf16>.
+      // A normal load pads a value to a dword (e.g., 32-bit) when loaded to a register.
+      // Packed load "packs" multiple sub-dword values along the column (↓), allowing a single register
+      // to hold multiple values.
+      // In SIMT, a work-item reads values along the column (↓), hence a sequence of values loaded by packing
+      // to register is logically equivalent to the sequence of values loaded using a normal load.
+      // The load results of both methods can have the same logical representation, but are expected to
+      // differ in physical layout and register efficiency.
+
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f16 = arith.sitofp %thread_x_i32 : i32 to f16
+      %loaded_f16_modified = vector.insert %thread_x_f16, %loaded_packed_f16 [0,0,1] : f16 into vector<8x1x2xf16> // Both loaded_packed_f16 and loaded_f16 can be used here
+      // We can only store [1,2,4,8]x[16] shapes for f16, so we have to do 2 stores
+      %loaded_f16_modified_slice_0 = vector.extract_strided_slice %loaded_f16_modified
+          {offsets = [0, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16>
+      %loaded_f16_modified_slice_0_flat = vector.shape_cast %loaded_f16_modified_slice_0 : vector<4x1x2xf16> to vector<8xf16>
+      %base_height_store = arith.constant 8 : i32 // number of rows
+      %base_width_store = arith.constant 32 : i32 // bytewidth of the block
+      %base_pitch_store = arith.constant 32 : i32 // bytewidth of the base row
+      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_0_flat
+          <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>)
+
+      %loaded_f16_modified_slice_1 = vector.extract_strided_slice %loaded_f16_modified
+          {offsets = [4, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16>
+      %loaded_f16_modified_slice_1_flat = vector.shape_cast %loaded_f16_modified_slice_1 : vector<4x1x2xf16> to vector<8xf16>
+
+      %second_half_offset = arith.muli %base_pitch_store, %base_height_store : i32 // 32 bytes per row * 8 rows = byte offset of the second 8-row half
+      %second_half_ptr = llvm.getelementptr %dst[%second_half_offset] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 // i8 element type, so the index is a byte offset
+      xevm.blockstore2d %second_half_ptr, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_1_flat
+          <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>)
+      gpu.return
+    }
+  }
+
+
+  func.func @test(%src : memref<16x16xf16>) -> memref<16x16xf16> attributes {llvm.emit_c_interface} { // Copies %src into a gpu.alloc'ed buffer, runs the kernel, and returns a host copy of the kernel's output buffer.
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<16x16xf16>
+    gpu.memcpy %memref_src, %src : memref<16x16xf16>, memref<16x16xf16>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x16xf16> -> index // the kernel takes raw !llvm.ptr<1> arguments, not memrefs
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<16x16xf16>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<16x16xf16> -> index // same pointer conversion as for the source buffer
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
+        args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<16x16xf16>
+    %dst = memref.alloc() : memref<16x16xf16> // host-side result buffer; @main deallocates it
+    gpu.memcpy %dst, %memref_dst : memref<16x16xf16>, memref<16x16xf16>
+    gpu.dealloc %memref_dst : memref<16x16xf16>
+    return %dst : memref<16x16xf16>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} { // Host driver: fill A with 11.1, run @test on it, then print A followed by the result B.
+    %A = memref.alloc() : memref<16x16xf16>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %rows = arith.constant 16 : index // row count of A
+    %cols = arith.constant 16 : index // column count of A
+    %fill_f16 = arith.constant 11.1 : f16 // fill value written to every element of A
+    scf.for %i = %c0 to %rows step %c1 {
+      scf.for %j = %c0 to %cols step %c1 {
+        memref.store %fill_f16, %A[%i, %j] : memref<16x16xf16>
+      }
+    }
+    %B = call @test(%A) : (memref<16x16xf16>) -> memref<16x16xf16>
+    %B_cast = memref.cast %B : memref<16x16xf16> to memref<*xf16> // printMemrefF16 takes an unranked memref
+    %A_cast = memref.cast %A : memref<16x16xf16> to memref<*xf16>
+    call @printMemrefF16(%A_cast) : (memref<*xf16>) -> () // printed first: the input as filled above
+    call @printMemrefF16(%B_cast) : (memref<*xf16>) -> () // printed second: the buffer returned by @test
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.1{{.*}}]
+    // CHECK-COUNT-224: 11.1
+    // CHECK-NEXT: [11.1{{.*}}]
+
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.1{{.*}}]
+    // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    // CHECK-COUNT-208: 11.1
+    // CHECK-NEXT: [11.1{{.*}}]
+
+    memref.dealloc %A : memref<16x16xf16>
+    memref.dealloc %B : memref<16x16xf16> // B was allocated with memref.alloc inside @test
+    return
+  }
+  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
new file mode 100644
index 0000000..1d164be
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
@@ -0,0 +1,133 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel  { // Loads a 16-row x 8-column tile of 32-bit values with transpose=true and stores it as an 8-row x 16-column tile.
+      %base_width = arith.constant 32 : i32 // bytewidth of the block
+      %base_height = arith.constant 16 : i32 // number of rows
+      %base_pitch = arith.constant 32 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+      // Normally a work-item loads a vertical slice (↓), but with *transpose* a work-item
+      // loads a horizontal slice (→).
+      // The tile dimension we want to slice must be a multiple of the sub-group size:
+      //  e.g., to slice rows (→) we need tile_height % SG_SIZE == 0. NOTE(review): was written as SG_SIZE % tile_height, contradicting the line above; confirm.
+      %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y
+          <{elem_size_in_bits=32 : i32, tile_width=8 : i32, tile_height=16 : i32, v_blocks=1 : i32,
+            transpose=true, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32>
+
+      %c0 = arith.constant 0 : i32 // NOTE(review): not used anywhere in this kernel
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32
+      %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[7] : f32 into vector<8xf32> // Debug aid: store %loaded_f32_modified_1 instead of %loaded to see where thread ids end up
+      %loaded_f32_modified_1 = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32> // unused by default; the store below writes the unmodified %loaded
+
+      %base_height_store = arith.constant 8 : i32 // number of rows
+      %base_width_store = arith.constant 64 : i32 // bytewidth of the block
+      %base_pitch_store = arith.constant 64 : i32 // bytewidth of the base row
+      // "Transposed" stores are not available, meaning a work-item can only store its vector as a vertical slice (↓).
+      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded
+          <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+
+  func.func @test(%src : memref<16x8xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { // Copies the 16x8 %src into a gpu.alloc'ed buffer, runs the kernel, and returns a host copy of the 8x16 output buffer.
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<16x8xf32>
+    gpu.memcpy %memref_src, %src : memref<16x8xf32>, memref<16x8xf32>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x8xf32> -> index // the kernel takes raw !llvm.ptr<1> arguments, not memrefs
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<8x16xf32>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index // same pointer conversion as for the source buffer
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
+        args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<16x8xf32>
+    %dst = memref.alloc() : memref<8x16xf32> // host-side result buffer; @main deallocates it
+    gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_dst : memref<8x16xf32>
+    return %dst : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} { // Host driver: fill A, print it, run @test on it, then print the result B.
+    %A = memref.alloc() : memref<16x8xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    // Fill A so that A[i][j] = i + j / 10; each printed value then encodes its (row, column).
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c8 step %c1 {
+        %c_10_f = arith.constant 10.0 : f32
+        %j_i64 = arith.index_cast %j : index to i64
+        %j_i32 = llvm.trunc %j_i64 : i64 to i32
+        %j_f32 = arith.sitofp %j_i32 : i32 to f32
+        %jj = arith.divf %j_f32, %c_10_f : f32
+
+        %i_i64 = arith.index_cast %i : index to i64
+        %i_i32 = llvm.trunc %i_i64 : i64 to i32
+        %i_f32 = arith.sitofp %i_i32 : i32 to f32
+        %ii = arith.addf %i_f32, %jj : f32
+        memref.store %ii, %A[%i, %j] : memref<16x8xf32>
+      }
+    }
+    %B = call @test(%A) : (memref<16x8xf32>) -> memref<8x16xf32>
+    %A_cast = memref.cast %A : memref<16x8xf32> to memref<*xf32>
+    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0,   0.1,   0.2,   0.3,   0.4,   0.5,   0.6,   0.7],
+    // CHECK-NEXT: [1,   1.1,   1.2,   1.3,   1.4,   1.5,   1.6,   1.7],
+    // CHECK-NEXT: [2,   2.1,   2.2,   2.3,   2.4,   2.5,   2.6,   2.7],
+    // CHECK-NEXT: [3,   3.1,   3.2,   3.3,   3.4,   3.5,   3.6,   3.7],
+    // CHECK-NEXT: [4,   4.1,   4.2,   4.3,   4.4,   4.5,   4.6,   4.7],
+    // CHECK-NEXT: [5,   5.1,   5.2,   5.3,   5.4,   5.5,   5.6,   5.7],
+    // CHECK-NEXT: [6,   6.1,   6.2,   6.3,   6.4,   6.5,   6.6,   6.7],
+    // CHECK-NEXT: [7,   7.1,   7.2,   7.3,   7.4,   7.5,   7.6,   7.7],
+    // CHECK-NEXT: [8,   8.1,   8.2,   8.3,   8.4,   8.5,   8.6,   8.7],
+    // CHECK-NEXT: [9,   9.1,   9.2,   9.3,   9.4,   9.5,   9.6,   9.7],
+    // CHECK-NEXT: [10,   10.1,   10.2,   10.3,   10.4,   10.5,   10.6,   10.7],
+    // CHECK-NEXT: [11,   11.1,   11.2,   11.3,   11.4,   11.5,   11.6,   11.7],
+    // CHECK-NEXT: [12,   12.1,   12.2,   12.3,   12.4,   12.5,   12.6,   12.7],
+    // CHECK-NEXT: [13,   13.1,   13.2,   13.3,   13.4,   13.5,   13.6,   13.7],
+    // CHECK-NEXT: [14,   14.1,   14.2,   14.3,   14.4,   14.5,   14.6,   14.7],
+    // CHECK-NEXT: [15,   15.1,   15.2,   15.3,   15.4,   15.5,   15.6,   15.7]
+
+    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,   11,   12,   13,   14,   15],
+    // CHECK-NEXT: [0.1,   1.1,   2.1,   3.1,   4.1,   5.1,   6.1,   7.1,   8.1,   9.1,   10.1,   11.1,   12.1,   13.1,   14.1,   15.1],
+    // CHECK-NEXT: [0.2,   1.2,   2.2,   3.2,   4.2,   5.2,   6.2,   7.2,   8.2,   9.2,   10.2,   11.2,   12.2,   13.2,   14.2,   15.2],
+    // CHECK-NEXT: [0.3,   1.3,   2.3,   3.3,   4.3,   5.3,   6.3,   7.3,   8.3,   9.3,   10.3,   11.3,   12.3,   13.3,   14.3,   15.3],
+    // CHECK-NEXT: [0.4,   1.4,   2.4,   3.4,   4.4,   5.4,   6.4,   7.4,   8.4,   9.4,   10.4,   11.4,   12.4,   13.4,   14.4,   15.4],
+    // CHECK-NEXT: [0.5,   1.5,   2.5,   3.5,   4.5,   5.5,   6.5,   7.5,   8.5,   9.5,   10.5,   11.5,   12.5,   13.5,   14.5,   15.5],
+    // CHECK-NEXT: [0.6,   1.6,   2.6,   3.6,   4.6,   5.6,   6.6,   7.6,   8.6,   9.6,   10.6,   11.6,   12.6,   13.6,   14.6,   15.6],
+    // CHECK-NEXT: [0.7,   1.7,   2.7,   3.7,   4.7,   5.7,   6.7,   7.7,   8.7,   9.7,   10.7,   11.7,   12.7,   13.7,   14.7,   15.7]
+
+    memref.dealloc %A : memref<16x8xf32>
+    memref.dealloc %B : memref<8x16xf32> // B was allocated with memref.alloc inside @test
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir
new file mode 100644
index 0000000..c5f4cd5
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir
@@ -0,0 +1,75 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+
+  gpu.module @kernel {
+    gpu.func @store_constant(%ptr: !llvm.ptr<1>) kernel { // Each invocation writes 42.0 to the f32 slot of %ptr indexed by its lane id.
+      %const_val = arith.constant 42.0 : f32
+      %lane = gpu.lane_id
+      %lane_i64 = arith.index_cast %lane : index to i64
+      %lane_ptr = llvm.getelementptr %ptr[%lane_i64] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32 // element type matches the f32 store below (same 4-byte stride as the former i32)
+      llvm.store %const_val, %lane_ptr : f32, !llvm.ptr<1>
+      gpu.return
+    }
+  }
+  func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { // Copies %src into a gpu.alloc'ed buffer, runs @store_constant on it, and returns a host copy.
+    %one = arith.constant 1 : index
+    %num_threads = arith.constant 16 : index
+    %dev_buf = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %dev_buf, %src : memref<8x16xf32>, memref<8x16xf32>
+    %buf_ptr_as_idx = memref.extract_aligned_pointer_as_index %dev_buf : memref<8x16xf32> -> index // the kernel takes a raw !llvm.ptr<1>, not a memref
+    %buf_ptr_as_i64 = arith.index_cast %buf_ptr_as_idx : index to i64
+    %buf_ptr = llvm.inttoptr %buf_ptr_as_i64 : i64 to !llvm.ptr
+    %buf_ptr_casted = llvm.addrspacecast %buf_ptr : !llvm.ptr to !llvm.ptr<1>
+    gpu.launch_func @kernel::@store_constant blocks in (%one, %one, %one) threads in (%num_threads, %one, %one)
+        args(%buf_ptr_casted : !llvm.ptr<1>)
+    %host_result = memref.alloc() : memref<8x16xf32> // host-side result buffer; @main deallocates it
+    gpu.memcpy %host_result, %dev_buf : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %dev_buf : memref<8x16xf32>
+
+    return %host_result : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} { // Host driver: fill A with 11.11, run @test on it, then print A followed by the result B.
+    %A = memref.alloc() : memref<8x16xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index // row count of A
+    %c16 = arith.constant 16 : index // column count of A
+    %c11_f32 = arith.constant 11.11 : f32 // fill value written to every element of A
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32>
+      }
+    }
+    %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32>
+    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> // printMemrefF32 takes an unranked memref
+    %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () // printed first: the input as filled above
+    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () // printed second: the buffer returned by @test
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.11{{.*}}]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    memref.dealloc %A : memref<8x16xf32>
+    memref.dealloc %B : memref<8x16xf32> // B was allocated with memref.alloc inside @test
+    return
+  }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>)
+}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index ce43941..ac334ea 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -86,12 +86,12 @@ llvm.func @kernel_func_unsafe_fp_atomics()
 }
 
 llvm.func @rocdl.lane_id() -> i32 {
-  // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-  // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
+  // CHECK: [[mbcntlo:%.+]] = call noundef range(i32 0, 32) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  // CHECK-NEXT: call noundef range(i32 0, 64) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
   %0 = llvm.mlir.constant(-1 : i32) : i32
   %1 = llvm.mlir.constant(0 : i32) : i32
-  %2 = rocdl.mbcnt.lo %0, %1 : (i32, i32) -> i32
-  %3 = rocdl.mbcnt.hi %0, %2 : (i32, i32) -> i32
+  %2 = rocdl.mbcnt.lo %0, %1 {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 32>}]} : (i32, i32) -> i32
+  %3 = rocdl.mbcnt.hi %0, %2 {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 64>}]} : (i32, i32) -> i32
   llvm.return %3 : i32
 }
 
diff --git a/mlir/test/Target/SPIRV/debug-negative.mlir b/mlir/test/Target/SPIRV/debug-negative.mlir
new file mode 100644
index 0000000..2c82687
--- /dev/null
+++ b/mlir/test/Target/SPIRV/debug-negative.mlir
@@ -0,0 +1,5 @@
+// RUN: mlir-translate %s --test-spirv-roundtrip-debug --no-implicit-module --verify-diagnostics
+
+// expected-error@below {{SPV_KHR_non_semantic_info extension not available}}
+spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader], []> attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader], []>, #spirv.resource_limits<>>} {
+}
diff --git a/mlir/test/Target/SPIRV/debug.mlir b/mlir/test/Target/SPIRV/debug.mlir
index 58bf364..5a7ed19 100644
--- a/mlir/test/Target/SPIRV/debug.mlir
+++ b/mlir/test/Target/SPIRV/debug.mlir
@@ -1,69 +1,70 @@
 // RUN: mlir-translate -no-implicit-module -test-spirv-roundtrip-debug -mlir-print-debuginfo -mlir-print-local-scope %s | FileCheck %s
+// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv %s | spirv-val %}
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
-  // CHECK: loc({{".*debug.mlir"}}:5:3)
+spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, GroupNonUniformArithmetic], [SPV_KHR_non_semantic_info, SPV_KHR_storage_buffer_storage_class]> attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniformArithmetic], [SPV_KHR_non_semantic_info, SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>} {
+  // CHECK: loc({{".*debug.mlir"}}:6:3)
   spirv.GlobalVariable @var0 bind(0, 1) : !spirv.ptr<f32, Input>
   spirv.func @arithmetic(%arg0 : vector<4xf32>, %arg1 : vector<4xf32>) "None" {
-    // CHECK: loc({{".*debug.mlir"}}:8:10)
+    // CHECK: loc({{".*debug.mlir"}}:9:10)
     %0 = spirv.FAdd %arg0, %arg1 : vector<4xf32>
-    // CHECK: loc({{".*debug.mlir"}}:10:10)
+    // CHECK: loc({{".*debug.mlir"}}:11:10)
     %1 = spirv.FNegate %arg0 : vector<4xf32>
     spirv.Return
   }
 
   spirv.func @atomic(%ptr: !spirv.ptr<i32, Workgroup>, %value: i32, %comparator: i32) "None" {
-    // CHECK: loc({{".*debug.mlir"}}:16:10)
+    // CHECK: loc({{".*debug.mlir"}}:17:10)
     %1 = spirv.AtomicAnd <Device> <None> %ptr, %value : !spirv.ptr<i32, Workgroup>
     spirv.Return
   }
 
   spirv.func @bitwiser(%arg0 : i32, %arg1 : i32) "None" {
-    // CHECK: loc({{".*debug.mlir"}}:22:10)
+    // CHECK: loc({{".*debug.mlir"}}:23:10)
     %0 = spirv.BitwiseAnd %arg0, %arg1 : i32
     spirv.Return
   }
 
   spirv.func @convert(%arg0 : f32) "None" {
-    // CHECK: loc({{".*debug.mlir"}}:28:10)
+    // CHECK: loc({{".*debug.mlir"}}:29:10)
     %0 = spirv.ConvertFToU %arg0 : f32 to i32
     spirv.Return
   }
 
   spirv.func @composite(%arg0 : !spirv.struct<(f32, !spirv.struct<(!spirv.array<4xf32>, f32)>)>, %arg1: !spirv.array<4xf32>, %arg2 : f32, %arg3 : f32) "None" {
-    // CHECK: loc({{".*debug.mlir"}}:34:10)
+    // CHECK: loc({{".*debug.mlir"}}:35:10)
     %0 = spirv.CompositeInsert %arg1, %arg0[1 : i32, 0 : i32] : !spirv.array<4xf32> into !spirv.struct<(f32, !spirv.struct<(!spirv.array<4xf32>, f32)>)>
-    // CHECK: loc({{".*debug.mlir"}}:36:10)
+    // CHECK: loc({{".*debug.mlir"}}:37:10)
     %1 = spirv.CompositeConstruct %arg2, %arg3 : (f32, f32) -> vector<2xf32>
     spirv.Return
   }
 
   spirv.func @group_non_uniform(%val: f32) "None" {
-    // CHECK: loc({{".*debug.mlir"}}:42:10)
+    // CHECK: loc({{".*debug.mlir"}}:43:10)
     %0 = spirv.GroupNonUniformFAdd <Workgroup> <Reduce> %val : f32 -> f32
     spirv.Return
   }
 
   spirv.func @local_var() "None" {
     %zero = spirv.Constant 0: i32
-    // CHECK: loc({{".*debug.mlir"}}:49:12)
+    // CHECK: loc({{".*debug.mlir"}}:50:12)
     %var = spirv.Variable init(%zero) : !spirv.ptr<i32, Function>
     spirv.Return
   }
 
   spirv.func @logical(%arg0: i32, %arg1: i32) "None" {
-    // CHECK: loc({{".*debug.mlir"}}:55:10)
+    // CHECK: loc({{".*debug.mlir"}}:56:10)
     %0 = spirv.IEqual %arg0, %arg1 : i32
     spirv.Return
   }
 
   spirv.func @memory_accesses(%arg0 : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, StorageBuffer>, %arg1 : i32, %arg2 : i32) "None" {
-    // CHECK: loc({{".*debug.mlir"}}:61:10)
+    // CHECK: loc({{".*debug.mlir"}}:62:10)
     %2 = spirv.AccessChain %arg0[%arg1, %arg2] : !spirv.ptr<!spirv.array<4x!spirv.array<4xf32>>, StorageBuffer>, i32, i32 -> !spirv.ptr<f32, StorageBuffer>
-    // CHECK: loc({{".*debug.mlir"}}:63:10)
+    // CHECK: loc({{".*debug.mlir"}}:64:10)
     %3 = spirv.Load "StorageBuffer" %2 : f32
-    // CHECK: loc({{.*debug.mlir"}}:65:5)
+    // CHECK: loc({{.*debug.mlir"}}:66:5)
     spirv.Store "StorageBuffer" %2, %3 : f32
-    // CHECK: loc({{".*debug.mlir"}}:67:5)
+    // CHECK: loc({{".*debug.mlir"}}:68:5)
     spirv.Return
   }
 
@@ -73,49 +74,49 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %ivar = spirv.Variable init(%zero) : !spirv.ptr<i32, Function>
     %jvar = spirv.Variable init(%zero) : !spirv.ptr<i32, Function>
     spirv.mlir.loop {
-      // CHECK: loc({{".*debug.mlir"}}:75:5)
+      // CHECK: loc({{".*debug.mlir"}}:76:5)
       spirv.Branch ^header
     ^header:
       %ival0 = spirv.Load "Function" %ivar : i32
       %icmp = spirv.SLessThan %ival0, %count : i32
-      // CHECK: loc({{".*debug.mlir"}}:75:5)
+      // CHECK: loc({{".*debug.mlir"}}:76:5)
       spirv.BranchConditional %icmp, ^body, ^merge
     ^body:
       spirv.Store "Function" %jvar, %zero : i32
       spirv.mlir.loop {
-        // CHECK: loc({{".*debug.mlir"}}:85:7)
+        // CHECK: loc({{".*debug.mlir"}}:86:7)
         spirv.Branch ^header
       ^header:
         %jval0 = spirv.Load "Function" %jvar : i32
         %jcmp = spirv.SLessThan %jval0, %count : i32
-        // CHECK: loc({{".*debug.mlir"}}:85:7)
+        // CHECK: loc({{".*debug.mlir"}}:86:7)
         spirv.BranchConditional %jcmp, ^body, ^merge
       ^body:
-        // CHECK: loc({{".*debug.mlir"}}:95:9)
+        // CHECK: loc({{".*debug.mlir"}}:96:9)
         spirv.Branch ^continue
       ^continue:
         %jval1 = spirv.Load "Function" %jvar : i32
         %add = spirv.IAdd %jval1, %one : i32
         spirv.Store "Function" %jvar, %add : i32
-        // CHECK: loc({{".*debug.mlir"}}:101:9)
+        // CHECK: loc({{".*debug.mlir"}}:102:9)
         spirv.Branch ^header
       ^merge:
-        // CHECK: loc({{".*debug.mlir"}}:85:7)
+        // CHECK: loc({{".*debug.mlir"}}:86:7)
         spirv.mlir.merge
-        // CHECK: loc({{".*debug.mlir"}}:85:7)
+        // CHECK: loc({{".*debug.mlir"}}:86:7)
       }
-      // CHECK: loc({{".*debug.mlir"}}:108:7)
+      // CHECK: loc({{".*debug.mlir"}}:109:7)
       spirv.Branch ^continue
     ^continue:
       %ival1 = spirv.Load "Function" %ivar : i32
       %add = spirv.IAdd %ival1, %one : i32
       spirv.Store "Function" %ivar, %add : i32
-      // CHECK: loc({{".*debug.mlir"}}:114:7)
+      // CHECK: loc({{".*debug.mlir"}}:115:7)
       spirv.Branch ^header
     ^merge:
-      // CHECK: loc({{".*debug.mlir"}}:75:5)
+      // CHECK: loc({{".*debug.mlir"}}:76:5)
       spirv.mlir.merge
-    // CHECK: loc({{".*debug.mlir"}}:75:5)
+    // CHECK: loc({{".*debug.mlir"}}:76:5)
     }
     spirv.Return
   }
@@ -126,21 +127,23 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %two = spirv.Constant 2: i32
     %var = spirv.Variable init(%zero) : !spirv.ptr<i32, Function>
     spirv.mlir.selection {
-      // CHECK: loc({{".*debug.mlir"}}:128:5)
+      // CHECK: loc({{".*debug.mlir"}}:129:5)
       spirv.BranchConditional %cond [5, 10], ^then, ^else
     ^then:
       spirv.Store "Function" %var, %one : i32
-      // CHECK: loc({{".*debug.mlir"}}:134:7)
+      // CHECK: loc({{".*debug.mlir"}}:135:7)
       spirv.Branch ^merge
     ^else:
       spirv.Store "Function" %var, %two : i32
-      // CHECK: loc({{".*debug.mlir"}}:138:7)
+      // CHECK: loc({{".*debug.mlir"}}:139:7)
       spirv.Branch ^merge
     ^merge:
-      // CHECK: loc({{".*debug.mlir"}}:128:5)
+      // CHECK: loc({{".*debug.mlir"}}:129:5)
       spirv.mlir.merge
-    // CHECK: loc({{".*debug.mlir"}}:128:5)
+    // CHECK: loc({{".*debug.mlir"}}:129:5)
     }
     spirv.Return
   }
+
+  spirv.EntryPoint "GLCompute" @local_var
 }
diff --git a/mlir/test/Target/SPIRV/mlir-translate.mlir b/mlir/test/Target/SPIRV/mlir-translate.mlir
index 9f91fc9..cbce351 100644
--- a/mlir/test/Target/SPIRV/mlir-translate.mlir
+++ b/mlir/test/Target/SPIRV/mlir-translate.mlir
@@ -5,7 +5,7 @@
 // RUN: rm -rf %t
 // RUN: mkdir %t && mlir-translate --serialize-spirv --no-implicit-module \
 // RUN: --split-input-file --spirv-save-validation-files-with-prefix=%t/foo %s \
-// RUN: && ls %t | wc -l | FileCheck %s
+// RUN: && ls %t/foo*.spv | wc -l | FileCheck %s
 // RUN: rm -rf %t
 
 // CHECK: 4
diff --git a/mlir/test/Target/SPIRV/module.mlir b/mlir/test/Target/SPIRV/module.mlir
index dcdcab8..d4000df 100644
--- a/mlir/test/Target/SPIRV/module.mlir
+++ b/mlir/test/Target/SPIRV/module.mlir
@@ -1,21 +1,29 @@
-// RUN: mlir-translate -no-implicit-module -test-spirv-roundtrip -split-input-file %s | FileCheck %s
+// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip --split-input-file %s | FileCheck %s
+
+// REQUIRES: shell
+// RUN: %if spirv-tools %{ rm -rf %t %}
+// RUN: %if spirv-tools %{ mkdir %t %}
+// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %}
+// RUN: %if spirv-tools %{ ls %t/module*.spv | xargs -I{} spirv-val {} %}
 
 // CHECK:      spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
 // CHECK-NEXT:   spirv.func @foo() "Inline" {
 // CHECK-NEXT:     spirv.Return
 // CHECK-NEXT:   }
+// CHECK-NEXT:   spirv.EntryPoint "Vertex" @foo
 // CHECK-NEXT: }
 
 spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
   spirv.func @foo() -> () "Inline" {
      spirv.Return
   }
+  spirv.EntryPoint "Vertex" @foo
 }
 
 // -----
 
 // CHECK: v1.5
-spirv.module Logical GLSL450 requires #spirv.vce<v1.5, [Shader], []> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.5, [Shader, Linkage], []> {
 }
 
 // -----
@@ -26,13 +34,13 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.6, [Shader, Linkage], []> {
 
 // -----
 
-// CHECK: [Shader, Float16]
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Float16], []> {
+// CHECK: [Shader, Float16, Linkage]
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Float16, Linkage], []> {
 }
 
 // -----
 
 // CHECK: [SPV_KHR_float_controls, SPV_KHR_subgroup_vote]
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], [SPV_KHR_float_controls, SPV_KHR_subgroup_vote]> {
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, Linkage], [SPV_KHR_float_controls, SPV_KHR_subgroup_vote]> {
 }
 
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 5630d15..9a04da7 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -1,9 +1,14 @@
-// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns -verify-diagnostics -profile-actions-to=- %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics -profile-actions-to=- %s | FileCheck %s --check-prefix=CHECK-PROFILER
+// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0" -verify-diagnostics %s | FileCheck %s
+
+// CHECK-PROFILER: "name": "pass-execution", "cat": "PERF", "ph": "B"
+// CHECK-PROFILER: "name": "apply-conversion", "cat": "PERF", "ph": "B"
+// CHECK-PROFILER: "name": "apply-pattern", "cat": "PERF", "ph": "B"
+// CHECK-PROFILER: "name": "apply-pattern", "cat": "PERF", "ph": "E"
+// CHECK-PROFILER: "name": "apply-conversion", "cat": "PERF", "ph": "E"
+// CHECK-PROFILER: "name": "pass-execution", "cat": "PERF", "ph": "E"
 
-//      CHECK: "name": "pass-execution", "cat": "PERF", "ph": "B"
-//      CHECK: "name": "apply-conversion", "cat": "PERF", "ph": "B"
-//      CHECK: "name": "apply-pattern", "cat": "PERF", "ph": "B"
-//      CHECK: "name": "apply-pattern", "cat": "PERF", "ph": "E"
 // Note: Listener notifications appear after the pattern application because
 // the conversion driver sends all notifications at the end of the conversion
 // in bulk.
@@ -11,8 +16,6 @@
 // CHECK-NEXT: notifyOperationReplaced: test.illegal_op_a
 // CHECK-NEXT: notifyOperationModified: func.return
 // CHECK-NEXT: notifyOperationErased: test.illegal_op_a
-//      CHECK: "name": "apply-conversion", "cat": "PERF", "ph": "E"
-//      CHECK: "name": "pass-execution", "cat": "PERF", "ph": "E"
 // CHECK-LABEL: verifyDirectPattern
 func.func @verifyDirectPattern() -> i32 {
   // CHECK-NEXT:  "test.legal_op_a"() <{status = "Success"}
@@ -29,7 +32,9 @@ func.func @verifyDirectPattern() -> i32 {
 // CHECK-NEXT: notifyOperationErased: test.illegal_op_c
 // CHECK-NEXT: notifyOperationInserted: test.legal_op_a, was unlinked
 // CHECK-NEXT: notifyOperationReplaced: test.illegal_op_e
-// CHECK-NEXT: notifyOperationErased: test.illegal_op_e
+// Note: func.return is modified a second time when running in no-rollback
+//       mode.
+//      CHECK: notifyOperationErased: test.illegal_op_e
 
 // CHECK-LABEL: verifyLargerBenefit
 func.func @verifyLargerBenefit() -> i32 {
@@ -70,7 +75,7 @@ func.func @remap_call_1_to_1(%arg0: i64) {
 // CHECK:      notifyBlockInserted into func.func: was unlinked
 
 // Contents of the old block are moved to the new block.
-// CHECK-NEXT: notifyOperationInserted: test.return, was linked, exact position unknown
+// CHECK-NEXT: notifyOperationInserted: test.return
 
 // The old block is erased.
 // CHECK-NEXT: notifyBlockErased
@@ -409,8 +414,10 @@ func.func @test_remap_block_arg() {
 
 // CHECK-LABEL: func @test_multiple_1_to_n_replacement()
 //       CHECK:   %[[legal_op:.*]]:4 = "test.legal_op"() : () -> (f16, f16, f16, f16)
-//       CHECK:   %[[cast:.*]] = "test.cast"(%[[legal_op]]#0, %[[legal_op]]#1, %[[legal_op]]#2, %[[legal_op]]#3) : (f16, f16, f16, f16) -> f16
-//       CHECK:   "test.valid"(%[[cast]]) : (f16) -> ()
+// Note: There is a bug in the rollback-based conversion driver: it emits a
+// "test.cast" : (f16, f16, f16, f16) -> f16, when it should be emitting
+// three consecutive casts of (f16, f16) -> f16.
+//       CHECK:   "test.valid"(%{{.*}}) : (f16) -> ()
 func.func @test_multiple_1_to_n_replacement() {
   %0 = "test.multiple_1_to_n_replacement"() : () -> (f16)
   "test.invalid"(%0) : (f16) -> ()
diff --git a/mlir/test/lib/Dialect/GPU/CMakeLists.txt b/mlir/test/lib/Dialect/GPU/CMakeLists.txt
index 418c884..882d5ab 100644
--- a/mlir/test/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/GPU/CMakeLists.txt
@@ -30,6 +30,7 @@ set(LIBS
   MLIRVectorDialect
   MLIRVectorToLLVMPass
   MLIRXeVMDialect
+  MLIRXeVMToLLVMIRTranslation
   )
 
 add_mlir_library(MLIRGPUTestPasses
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index ff958d9..657dfd2 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -1177,8 +1177,8 @@ struct TestNonRootReplacement : public RewritePattern {
     auto illegalOp = ILLegalOpF::create(rewriter, op->getLoc(), resultType);
     auto legalOp = LegalOpB::create(rewriter, op->getLoc(), resultType);
 
-    rewriter.replaceOp(illegalOp, legalOp);
     rewriter.replaceOp(op, illegalOp);
+    rewriter.replaceOp(illegalOp, legalOp);
     return success();
   }
 };
@@ -1362,6 +1362,7 @@ public:
     // Helper function that replaces the given op with a new op of the given
     // name and doubles each result (1 -> 2 replacement of each result).
     auto replaceWithDoubleResults = [&](Operation *op, StringRef name) {
+      rewriter.setInsertionPointAfter(op);
       SmallVector<Type> types;
       for (Type t : op->getResultTypes()) {
         types.push_back(t);
@@ -1560,6 +1561,7 @@ struct TestLegalizePatternDriver
     if (mode == ConversionMode::Partial) {
       DenseSet<Operation *> unlegalizedOps;
       ConversionConfig config;
+      config.allowPatternRollback = allowPatternRollback;
       DumpNotifications dumpNotifications;
       config.listener = &dumpNotifications;
       config.unlegalizedOps = &unlegalizedOps;
@@ -1582,6 +1584,7 @@ struct TestLegalizePatternDriver
       });
 
       ConversionConfig config;
+      config.allowPatternRollback = allowPatternRollback;
       DumpNotifications dumpNotifications;
       config.foldingMode = foldingMode;
       config.listener = &dumpNotifications;
@@ -1599,6 +1602,7 @@ struct TestLegalizePatternDriver
     DenseSet<Operation *> legalizedOps;
     ConversionConfig config;
     config.foldingMode = foldingMode;
+    config.allowPatternRollback = allowPatternRollback;
     config.legalizableOps = &legalizedOps;
     if (failed(applyAnalysisConversion(getOperation(), target,
                                        std::move(patterns), config)))
@@ -1634,6 +1638,9 @@ struct TestLegalizePatternDriver
                                   "after-patterns",
                                   "Only attempt to fold not legal operations "
                                   "after applying patterns"))};
+  Option<bool> allowPatternRollback{*this, "allow-pattern-rollback",
+                                    llvm::cl::desc("Allow pattern rollback"),
+                                    llvm::cl::init(true)};
 };
 } // namespace
 
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index d904780..0da7689 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -33,6 +33,7 @@ config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
+config.run_xevm_tests = @MLIR_ENABLE_XEVM_CONVERSIONS@
 config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
 config.enable_levelzero_runner = @MLIR_ENABLE_LEVELZERO_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index f5365ca..0f0e96c 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -765,7 +765,7 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name,
     return Error::success();
   }
   case OL_SYMBOL_KIND_GLOBAL_VARIABLE: {
-    auto &Global = Program->KernelSymbols[Name];
+    auto &Global = Program->GlobalSymbols[Name];
     if (!Global) {
       GlobalTy GlobalObj{Name};
       if (auto Res =
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index e94f3f6..82c9f9b 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -914,9 +914,19 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "error in cuStreamWaitEvent: %s");
   }
 
-  // TODO: This should be implementable on CUDA
   Expected<bool> hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfo) override {
-    return true;
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfo, Stream))
+      return Err;
+
+    CUresult Ret = cuStreamQuery(Stream);
+    if (Ret == CUDA_SUCCESS)
+      return false;
+
+    if (Ret == CUDA_ERROR_NOT_READY)
+      return true;
+
+    return Plugin::check(Ret, "error in cuStreamQuery: %s");
   }
 
   /// Synchronize the current thread with the event.
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 38f7e3b..adc0ee8 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1954,16 +1954,6 @@ libc_support_library(
 )
 
 libc_support_library(
-    name = "sincosf16_utils",
-    hdrs = ["src/math/generic/sincosf16_utils.h"],
-    deps = [
-        ":__support_common",
-        ":__support_fputil_nearest_integer",
-        ":__support_fputil_polyeval",
-    ],
-)
-
-libc_support_library(
     name = "explogxf",
     hdrs = ["src/math/generic/explogxf.h"],
     deps = [
@@ -2391,6 +2381,20 @@ libc_support_library(
 )
 
 libc_support_library(
+    name = "__support_math_cosf16",
+    hdrs = ["src/__support/math/cosf16.h"],
+    deps = [
+        ":__support_fputil_multiply_add",
+        ":__support_fputil_fenv_impl",
+        ":__support_fputil_cast",
+        ":__support_fputil_except_value_utils",
+        ":__support_macros_optimization",
+        ":__support_math_sincosf16_utils",
+        ":errno",
+    ],
+)
+
+libc_support_library(
     name = "__support_math_erff",
     hdrs = ["src/__support/math/erff.h"],
     deps = [
@@ -2704,6 +2708,16 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_sincosf16_utils",
+    hdrs = ["src/__support/math/sincosf16_utils.h"],
+    deps = [
+        ":__support_common",
+        ":__support_fputil_nearest_integer",
+        ":__support_fputil_polyeval",
+    ],
+)
+
 ############################### complex targets ################################
 
 libc_function(
@@ -3167,9 +3181,7 @@ libc_math_function(
 libc_math_function(
     name = "cosf16",
     additional_deps = [
-        ":__support_fputil_multiply_add",
-        ":__support_macros_optimization",
-        ":sincosf16_utils",
+        ":__support_math_cosf16",
     ],
 )
 
@@ -3214,7 +3226,7 @@ libc_math_function(
     additional_deps = [
         ":__support_fputil_multiply_add",
         ":__support_macros_optimization",
-        ":sincosf16_utils",
+        ":__support_math_sincosf16_utils",
     ],
 )
 
@@ -4292,7 +4304,7 @@ libc_math_function(
     additional_deps = [
         ":__support_fputil_nearest_integer",
         ":__support_fputil_polyeval",
-        ":sincosf16_utils",
+        ":__support_math_sincosf16_utils",
     ],
 )
 
@@ -4352,7 +4364,7 @@ libc_math_function(
     additional_deps = [
         ":__support_fputil_nearest_integer",
         ":__support_fputil_polyeval",
-        ":sincosf16_utils",
+        ":__support_math_sincosf16_utils",
     ],
 )
 
@@ -4421,7 +4433,7 @@ libc_math_function(
     additional_deps = [
         ":__support_fputil_nearest_integer",
         ":__support_fputil_polyeval",
-        ":sincosf16_utils",
+        ":__support_math_sincosf16_utils",
     ],
 )
 
@@ -4461,7 +4473,7 @@ libc_math_function(
 libc_math_function(
     name = "tanpif16",
     additional_deps = [
-        ":sincosf16_utils",
+        ":__support_math_sincosf16_utils",
         ":hdr_errno_macros",
         ":hdr_fenv_macros",
         ":__support_fputil_cast",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 49694a2..6a15b0c 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -8,6 +8,7 @@
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
+load("//llvm:targets.bzl", "llvm_targets")
 load(
     ":build_defs.bzl",
     "cc_headers_only",
@@ -4133,6 +4134,7 @@ cc_library(
         ":TosaToTensor",
         ":UBToLLVM",
         ":UBToSPIRV",
+        ":VectorToAMX",
         ":VectorToArmSME",
         ":VectorToGPU",
         ":VectorToLLVM",
@@ -5508,6 +5510,7 @@ cc_library(
         ":VCIXToLLVMIRTranslation",
         ":VectorDialect",
         ":XeVMDialect",
+        ":XeVMTarget",
         ":config",
         "//llvm:Core",
         "//llvm:MC",
@@ -9102,6 +9105,7 @@ cc_library(
         ":X86VectorDialect",
         ":XeGPUDialect",
         ":XeVMDialect",
+        ":XeVMTarget",
     ],
 )
 
@@ -9196,6 +9200,7 @@ cc_library(
         ":VectorToLLVM",
         ":VectorTransformOps",
         ":XeVMToLLVM",
+        ":XeVMToLLVMIRTranslation",
     ],
 )
 
@@ -11133,6 +11138,32 @@ cc_library(
 )
 
 cc_library(
+    name = "VectorToAMX",
+    srcs = glob([
+        "lib/Conversion/VectorToAMX/*.cpp",
+    ]),
+    hdrs = glob([
+        "include/mlir/Conversion/VectorToAMX/*.h",
+    ]),
+    includes = ["include"],
+    deps = [
+        ":AMXDialect",
+        ":AffineDialect",
+        ":AffineUtils",
+        ":ArithDialect",
+        ":ConversionPassIncGen",
+        ":DialectUtils",
+        ":IR",
+        ":LinalgInterfaces",
+        ":MemRefDialect",
+        ":Pass",
+        ":SCFDialect",
+        ":TransformUtils",
+        ":VectorDialect",
+    ],
+)
+
+cc_library(
     name = "VectorToGPU",
     srcs = glob([
         "lib/Conversion/VectorToGPU/*.cpp",
@@ -13791,6 +13822,34 @@ gentbl_cc_library(
 )
 
 cc_library(
+    name = "XeVMTarget",
+    srcs = ["lib/Target/LLVM/XeVM/Target.cpp"],
+    hdrs = glob(["include/mlir/Target/LLVM/XeVM/*.h"]),
+    includes = ["include"],
+    deps = [
+        ":ExecutionEngineUtils",
+        ":GPUDialect",
+        ":GPUToLLVMIRTranslation",
+        ":IR",
+        ":LLVMToLLVMIRTranslation",
+        ":Support",
+        ":TargetLLVM",
+        ":ToLLVMIRTranslation",
+        ":XeVMDialect",
+        ":XeVMToLLVMIRTranslation",
+        "//llvm:BitWriter",
+        "//llvm:Core",
+        "//llvm:Object",
+        "//llvm:Support",
+        "//llvm:Target",
+        "//llvm:config",
+    ] + ([
+        "//llvm:SPIRVCodeGen",
+        "//llvm:SPIRVUtilsAndDesc",
+    ] if "SPIRV" in llvm_targets else []),
+)
+
+cc_library(
     name = "XeVMToLLVM",
     srcs = glob([
         "lib/Conversion/XeVMToLLVM/*.cpp",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 27b1dbb..6cfd8f4 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -47,6 +47,7 @@ expand_template(
         "@MLIR_ENABLE_SPIRV_CPU_RUNNER@": "0",
         "@MLIR_ENABLE_VULKAN_RUNNER@": "0",
         "@MLIR_ENABLE_BINDINGS_PYTHON@": "0",
+        "@MLIR_ENABLE_XEVM_CONVERSIONS@": "0",
         "@MLIR_RUN_AMX_TESTS@": "0",
         "@MLIR_RUN_ARM_SVE_TESTS@": "0",
         "@MLIR_RUN_ARM_SME_TESTS@": "0",
author	Mehdi Amini <joker.eph@gmail.com>	2025-08-14 15:36:46 +0200
committer	GitHub <noreply@github.com>	2025-08-14 15:36:46 +0200
commit	df57d6a01e85ca78da2febab21b268d9fd6955a0 (patch)
tree	19b0aab453e6bc7e2b15d3220024dfdacd4fa57e
parent	df86ea61b7ed484ca797f96d7ad40fd9ada7ba30 (diff)
parent	7bda76367f19cfc19086f68d9dd5ac019a9ceccd (diff)
download	llvm-users/joker-eph-python-bindings-maintainers.zip llvm-users/joker-eph-python-bindings-maintainers.tar.gz llvm-users/joker-eph-python-bindings-maintainers.tar.bz2